csc3007-cyan2

Author

ONG ZHEN YU BRANDON, GOH BOON CHUN, CHNG JUN JIE JEREMY, EISEN REINER BAGUILAT PERDIDO, MAGESHWARAN SO MUTHUSAMY

1 Importing of libraries

library(readr)
library(ggplot2)
library(e1071)
library(readxl)
library(gridExtra)
library(tidyverse)
library(countrycode)
library(sf)
library(tmap)
library(lintr)
library(plotly)
library(MASS) # for boxcox normalisation
library(rmapshaper)
library(mapview)
library(leaflet)
library(RColorBrewer)
library(DT)
library(GGally)
library(dplyr)
library(psych)
library(knitr)
library(factoextra)
library(GGally)
library(data.table)
library(rpart)
library(rpart.plot)
library(rattle)
library(caret)

2 Selecting the indicators

#mortality_rate_attributed_to_unsafe_water
mortality_rate_unsafe_water <- read_csv("data/indicator_3.9.2.csv",show_col_types = FALSE)

#Proportion_of_population_using_safely_managed_drinking_water_services
proportion_of_safe_water<- read_csv("data/indicator_6.1.1.csv",show_col_types = FALSE)

#mortality_rate_unintentional_poisoning
mortality_rate_unintentional_poisoning <- read_csv("data/indicator_3.9.3.csv",show_col_types = FALSE)

#proportion-of-population-with-basic-handwashing-facilities-on-premises-by-urban-rural-percent
population_with_basic_handwashing_facilities<- read_csv("data/indicator_6.2.1.csv",show_col_types = FALSE)

3 Map of Indicator 1

3.1 Select specific columns for mortality rate unintentional poisoning

mortality_rate_unintentional_poisoning <- dplyr::select(
  mortality_rate_unintentional_poisoning,
  code = 'geoAreaCode',
  country = 'geoAreaName',
  region = 'parentName',
  iso = 'ISO3',
  gender = 'sex_desc',
  mortality_rate_unintentional_poisoning_2019 = 'value_2019'
)

# drop rows if contains at least one NA value
mortality_rate_unintentional_poisoning <- na.omit(mortality_rate_unintentional_poisoning)

#check for empty row
any(is.na(mortality_rate_unintentional_poisoning))
[1] FALSE
mortality_rate_unintentional_poisoning
# A tibble: 597 × 6
    code country     region                  iso   gender mortality_rate_unint…¹
   <dbl> <chr>       <chr>                   <chr> <chr>                   <dbl>
 1   470 Malta       Southern Europe         MLT   Both …                    0.1
 2   152 Chile       South America           CHL   Male                      0.5
 3     4 Afghanistan Southern Asia (excludi… AFG   Both …                    1  
 4   470 Malta       Southern Europe         MLT   Female                    0  
 5   156 China       Eastern Asia            CHN   Both …                    1.8
 6     4 Afghanistan Southern Asia (excludi… AFG   Both …                    1  
 7   470 Malta       Southern Europe         MLT   Male                      0.2
 8   156 China       Eastern Asia            CHN   Female                    1.5
 9   478 Mauritania  Western Africa          MRT   Both …                    1.5
10   478 Mauritania  Western Africa          MRT   Female                    1.3
# ℹ 587 more rows
# ℹ abbreviated name: ¹​mortality_rate_unintentional_poisoning_2019

3.2 Histogram of mortality rate unintential poisoning for year 2019

# Create the histogram
mortality_histo <- ggplot(mortality_rate_unintentional_poisoning, aes(x=mortality_rate_unintentional_poisoning_2019)) +
  geom_histogram(binwidth=0.1, fill='blue', color='black') +
  labs(title="Mortality rate unintentional poisoning for Year 2019", x="Mortality Rate (per 100,000)", y="Number of Country")

mortality_histo

3.3 Select specific columns for population with basic handwashing facilities

population_with_basic_handwashing_facilities <- dplyr::select(
  population_with_basic_handwashing_facilities,
  code = 'geoAreaCode',
  country = 'geoAreaName',
  region = 'parentName',
  iso = 'ISO3',
  area = 'location_desc',
  pop_with_basic_handwashing_facilites_2019 = 'value_2019'
)

# drop rows if contains at least one NA value
population_with_basic_handwashing_facilities <- na.omit(population_with_basic_handwashing_facilities)

#check for empty row
any(is.na(mortality_rate_unintentional_poisoning))
[1] FALSE
population_with_basic_handwashing_facilities
# A tibble: 284 × 6
    code country      region                  iso   area  pop_with_basic_handw…¹
   <dbl> <chr>        <chr>                   <chr> <chr>                  <dbl>
 1     4 Afghanistan  Southern Asia (excludi… AFG   All …                     38
 2   356 India        Southern Asia           IND   Urban                     82
 3     4 Afghanistan  Southern Asia (excludi… AFG   All …                     38
 4   360 Indonesia    South-Eastern Asia      IDN   All …                     94
 5     4 Afghanistan  Southern Asia (excludi… AFG   Rural                     29
 6   694 Sierra Leone Western Africa          SLE   Rural                     18
 7   360 Indonesia    South-Eastern Asia      IDN   Rural                     91
 8     4 Afghanistan  Southern Asia (excludi… AFG   Rural                     29
 9     4 Afghanistan  Southern Asia (excludi… AFG   Urban                     64
10     4 Afghanistan  Southern Asia (excludi… AFG   Urban                     64
# ℹ 274 more rows
# ℹ abbreviated name: ¹​pop_with_basic_handwashing_facilites_2019

3.4 Histogram of population with basic handwashing facilities for year 2019

# Create the histogram
pop_with_handwashing_facilities_histo <- ggplot(population_with_basic_handwashing_facilities, aes(x=pop_with_basic_handwashing_facilites_2019)) +
  geom_histogram(binwidth=5, fill='blue', color='black') +
  labs(title="Population with basic handwashing facilities for Year 2019", x="Population (%)", y="Number of Country")

pop_with_handwashing_facilities_histo

3.5 Left join between mortality rate and basic handwashing

countries_tb <- left_join(
  mortality_rate_unintentional_poisoning,
  population_with_basic_handwashing_facilities,
  by = join_by(country, iso)
) |>
  mutate(
    iso = case_match(
      iso,
      "COD" ~ "ZAR",
      "ROU" ~ "ROM",
      "TLS" ~ "TMP",
      "XKX" ~ "KSV",
      .default = iso
    )
  )
Warning in left_join(mortality_rate_unintentional_poisoning, population_with_basic_handwashing_facilities, : Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 3 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
countries_tb
# A tibble: 1,323 × 10
   code.x country   region.x iso   gender mortality_rate_unint…¹ code.y region.y
    <dbl> <chr>     <chr>    <chr> <chr>                   <dbl>  <dbl> <chr>   
 1    470 Malta     Souther… MLT   Both …                    0.1     NA <NA>    
 2    152 Chile     South A… CHL   Male                      0.5     NA <NA>    
 3      4 Afghanis… Souther… AFG   Both …                    1        4 Souther…
 4      4 Afghanis… Souther… AFG   Both …                    1        4 Souther…
 5      4 Afghanis… Souther… AFG   Both …                    1        4 Souther…
 6      4 Afghanis… Souther… AFG   Both …                    1        4 Souther…
 7      4 Afghanis… Souther… AFG   Both …                    1        4 Souther…
 8      4 Afghanis… Souther… AFG   Both …                    1        4 Souther…
 9    470 Malta     Souther… MLT   Female                    0       NA <NA>    
10    156 China     Eastern… CHN   Both …                    1.8     NA <NA>    
# ℹ 1,313 more rows
# ℹ abbreviated name: ¹​mortality_rate_unintentional_poisoning_2019
# ℹ 2 more variables: area <chr>,
#   pop_with_basic_handwashing_facilites_2019 <dbl>

3.6 Import map

countries_sf <- read_sf("./data/WB_countries_Admin0.geojson")
land_sf <- read_sf("./data/WB_Land.geojson")
countries_sf <-
  countries_sf |>
  ms_simplify() |> # Default argument `keep = 0.05`
  st_make_valid()
land_sf <- ms_simplify(land_sf)
npts(countries_sf)
[1] 30826
countries_sf <-
  countries_sf |>
  left_join(
    countries_tb,
    by = join_by(WB_A3 == iso)
  ) |>
  dplyr::select(country, continent = 'CONTINENT', gender = 'gender', area = 'area', iso = WB_A3, mortality_rate_unintentional_poisoning_2019, pop_with_basic_handwashing_facilites_2019)
countries_sf
Simple feature collection with 1246 features and 7 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: -180 ymin: -55.62754 xmax: 179.9038 ymax: 83.6341
Geodetic CRS:  WGS 84
# A tibble: 1,246 × 8
   country   continent gender     area      iso   mortality_rate_unintentional…¹
   <chr>     <chr>     <chr>      <chr>     <chr>                          <dbl>
 1 Indonesia Asia      Both sexes All areas IDN                              0.3
 2 Indonesia Asia      Both sexes Rural     IDN                              0.3
 3 Indonesia Asia      Both sexes Urban     IDN                              0.3
 4 Indonesia Asia      Female     All areas IDN                              0.1
 5 Indonesia Asia      Female     Rural     IDN                              0.1
 6 Indonesia Asia      Female     Urban     IDN                              0.1
 7 Indonesia Asia      Male       All areas IDN                              0.5
 8 Indonesia Asia      Male       Rural     IDN                              0.5
 9 Indonesia Asia      Male       Urban     IDN                              0.5
10 Malaysia  Asia      Both sexes <NA>      MYS                              0.7
# ℹ 1,236 more rows
# ℹ abbreviated name: ¹​mortality_rate_unintentional_poisoning_2019
# ℹ 2 more variables: pop_with_basic_handwashing_facilites_2019 <dbl>,
#   geometry <MULTIPOLYGON [°]>
polygon <- st_polygon(x = list(rbind(
  c(-180.0001, 90),
  c(-179.9999, 90),
  c(-179.9999, -90),
  c(-180.0001, -90),
  c(-180.0001, 90)
))) |>
  st_sfc() |>
  st_set_crs(4326) # Equirectangular projection
countries_sf <- mutate(
  countries_sf,
  geometry = st_difference(geometry, polygon)
) |>
  st_make_valid()
tm_shape(countries_sf) + tm_polygons()

3.7 Map of population with basic handwashing facilities for year 2019

countries_sf$area <- replace_na(countries_sf$area, "missing")
choropleth_pop <-
  tm_shape(land_sf, projection = "ESRI:54012") +
  tm_polygons(col = "grey") +
  tm_shape(countries_sf) +
  tm_polygons(
    col = "pop_with_basic_handwashing_facilites_2019",
    border.col = "grey30",
    palette = "Greens",
    breaks = c(-Inf, 20, 40, 60, 80, 100, Inf),
    colorNA = "grey",
    title = "Population (%)",
    lwd = 0.5
  ) +
  tm_text("iso", size = "AREA") +  # Use the new column to set the size
  tm_credits(
    c("", "Source: unstats-undesa.opendata.arcgis.com"),
    position = c("right", "bottom")
  ) +
  tm_layout(
    bg.color = "lightblue",
    frame = FALSE,
    inner.margins = c(0.00, 0.2, 0.2, 0.1),
    earth.boundary = TRUE,
    space.color = "white",
    main.title.size = 0.8,
    title.size = 0.5,
    main.title = "Population with basic handwashing facilities by Country for 2019"
  )

choropleth_pop

3.8 Map of mortality rate unintentional poisoning for year 2019

countries_sf$gender <- replace_na(countries_sf$gender, "missing")
choropleth_mor<-
  tm_shape(land_sf, projection = "ESRI:54012") +
  tm_polygons(col = "grey") +
  tm_shape(countries_sf) +
  tm_polygons(
    col = "mortality_rate_unintentional_poisoning_2019",
    border.col = "grey30",
    palette = "Oranges",
    breaks = c(-Inf, 0.2, 0.4, 0.6, 0.8, 1, Inf),
    colorNA = "grey",
    title = "Mortality Rate \nper 100,000",
    lwd = 0.5
  ) + 
  tm_text("iso", size = "AREA") +
  tm_credits(
    c("", "Source: unstats-undesa.opendata.arcgis.com"),
    position = c("right", "bottom")
  ) +
  tm_layout(
    bg.color = "lightblue",
    frame = FALSE,
    inner.margins = c(0.00, 0.2, 0.2, 0.1),
    earth.boundary = TRUE,
    space.color = "white",
    main.title.size = 0.8,
    title.size = 0.5,
    main.title = "Mortality Rate Unintentional Poisoning by Country for 2019"
  )

choropleth_mor

3.9 Filtering of areas

countries_sf_all_areas <- countries_sf[countries_sf$area == "All areas",]
countries_sf_urban <- countries_sf[countries_sf$area == "Urban",]
countries_sf_rural <- countries_sf[countries_sf$area == "Rural",]

countries_sf_all_areas
Simple feature collection with 324 features and 7 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: -172.7497 ymin: -34.80869 xmax: 166.1479 ymax: 55.38515
Geodetic CRS:  WGS 84
# A tibble: 324 × 8
   country                   continent gender area  iso   mortality_rate_unint…¹
   <chr>                     <chr>     <chr>  <chr> <chr>                  <dbl>
 1 Indonesia                 Asia      Both … All … IDN                      0.3
 2 Indonesia                 Asia      Female All … IDN                      0.1
 3 Indonesia                 Asia      Male   All … IDN                      0.5
 4 Bolivia (Plurinational S… South Am… Both … All … BOL                      0.6
 5 Bolivia (Plurinational S… South Am… Female All … BOL                      0.5
 6 Bolivia (Plurinational S… South Am… Male   All … BOL                      0.6
 7 India                     Asia      Both … All … IND                      0.3
 8 India                     Asia      Female All … IND                      0.2
 9 India                     Asia      Male   All … IND                      0.3
10 Ethiopia                  Africa    Both … All … ETH                      3.3
# ℹ 314 more rows
# ℹ abbreviated name: ¹​mortality_rate_unintentional_poisoning_2019
# ℹ 2 more variables: pop_with_basic_handwashing_facilites_2019 <dbl>,
#   geometry <MULTIPOLYGON [°]>
countries_sf_urban
Simple feature collection with 306 features and 7 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: -157.449 ymin: -34.80869 xmax: 166.1479 ymax: 55.38515
Geodetic CRS:  WGS 84
# A tibble: 306 × 8
   country                   continent gender area  iso   mortality_rate_unint…¹
   <chr>                     <chr>     <chr>  <chr> <chr>                  <dbl>
 1 Indonesia                 Asia      Both … Urban IDN                      0.3
 2 Indonesia                 Asia      Female Urban IDN                      0.1
 3 Indonesia                 Asia      Male   Urban IDN                      0.5
 4 Bolivia (Plurinational S… South Am… Both … Urban BOL                      0.6
 5 Bolivia (Plurinational S… South Am… Female Urban BOL                      0.5
 6 Bolivia (Plurinational S… South Am… Male   Urban BOL                      0.6
 7 India                     Asia      Both … Urban IND                      0.3
 8 India                     Asia      Female Urban IND                      0.2
 9 India                     Asia      Male   Urban IND                      0.3
10 Ethiopia                  Africa    Both … Urban ETH                      3.3
# ℹ 296 more rows
# ℹ abbreviated name: ¹​mortality_rate_unintentional_poisoning_2019
# ℹ 2 more variables: pop_with_basic_handwashing_facilites_2019 <dbl>,
#   geometry <MULTIPOLYGON [°]>
countries_sf_rural
Simple feature collection with 309 features and 7 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: -157.449 ymin: -34.80869 xmax: 166.1479 ymax: 55.38515
Geodetic CRS:  WGS 84
# A tibble: 309 × 8
   country                   continent gender area  iso   mortality_rate_unint…¹
   <chr>                     <chr>     <chr>  <chr> <chr>                  <dbl>
 1 Indonesia                 Asia      Both … Rural IDN                      0.3
 2 Indonesia                 Asia      Female Rural IDN                      0.1
 3 Indonesia                 Asia      Male   Rural IDN                      0.5
 4 Bolivia (Plurinational S… South Am… Both … Rural BOL                      0.6
 5 Bolivia (Plurinational S… South Am… Female Rural BOL                      0.5
 6 Bolivia (Plurinational S… South Am… Male   Rural BOL                      0.6
 7 Peru                      South Am… Both … Rural PER                      0.4
 8 Peru                      South Am… Female Rural PER                      0.3
 9 Peru                      South Am… Male   Rural PER                      0.5
10 India                     Asia      Both … Rural IND                      0.3
# ℹ 299 more rows
# ℹ abbreviated name: ¹​mortality_rate_unintentional_poisoning_2019
# ℹ 2 more variables: pop_with_basic_handwashing_facilites_2019 <dbl>,
#   geometry <MULTIPOLYGON [°]>
# Get a color palette with three colors
colors <- brewer.pal(3, "Set1")

# Assign colors to the categories
colorpal_area <- colorFactor(colors, 
                               levels = c("All areas", "Urban", "Rural"))

3.10 Map of population with basic handwashing facilities for all areas

leaflet(countries_sf_all_areas) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addPolygons(fillColor = ~colorpal_area(area),  
              fillOpacity = 0.8,
              weight = 2,
              opacity = 1,
              color = "white",
              dashArray = "3",
              highlight = highlightOptions(
                weight = 5,
                color = "#666",
                dashArray = "",
                fillOpacity = 0.7,
                bringToFront = TRUE),
              label = ~iso,  
              popup = ~paste("Country Code: ", iso,
                             "<br>Country: ", country,
                             "<br>Population with basic handwashing facilities (%): ", pop_with_basic_handwashing_facilites_2019,
                             "<br>Area: ", area)) %>%
  addLegend(pal = colorpal_area, 
            values = ~area, 
            title = "Area", 
            position = "bottomright")

3.11 Map of population with basic handwashing facilities for urban areas

leaflet(countries_sf_urban) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addPolygons(fillColor = ~colorpal_area(area),  # use colorpal_area here
              fillOpacity = 0.8,
              weight = 2,
              opacity = 1,
              color = "white",
              dashArray = "3",
              highlight = highlightOptions(
                weight = 5,
                color = "#666",
                dashArray = "",
                fillOpacity = 0.7,
                bringToFront = TRUE),
              label = ~iso,  # This adds labels that appear when you hover over a country
              popup = ~paste("Country Code: ", iso,
                             "<br>Country: ", country,
                             "<br>Population with basic handwashing facilities (%): ", pop_with_basic_handwashing_facilites_2019,
                             "<br>Area: ", area)) %>%
  addLegend(pal = colorpal_area, 
            values = ~area, 
            title = "Area", 
            position = "bottomright")

3.12 Map of population with basic handwashing facilities for rural areas

leaflet(countries_sf_rural) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addPolygons(fillColor = ~colorpal_area(area),  # use colorpal_area here
              fillOpacity = 0.8,
              weight = 2,
              opacity = 1,
              color = "white",
              dashArray = "3",
              highlight = highlightOptions(
                weight = 5,
                color = "#666",
                dashArray = "",
                fillOpacity = 0.7,
                bringToFront = TRUE),
              label = ~iso,  # This adds labels that appear when you hover over a country
              popup = ~paste("Country Code: ", iso,
                             "<br>Country: ", country,
                             "<br>Population with basic handwashing facilities (%): ", pop_with_basic_handwashing_facilites_2019,
                             "<br>Area: ", area)) %>%
  addLegend(pal = colorpal_area, 
            values = ~area, 
            title = "Area", 
            position = "bottomright")

3.13 Filtering of genders

countries_sf_male <- countries_sf[countries_sf$gender == "Male",]
countries_sf_female <- countries_sf[countries_sf$gender == "Female",]
countries_sf_both <- countries_sf[countries_sf$gender == "Both sexes",]

countries_sf_male
Simple feature collection with 410 features and 7 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: -179.9999 ymin: -55.62754 xmax: 179.9999 ymax: 83.11652
Geodetic CRS:  WGS 84
# A tibble: 410 × 8
   country                   continent gender area  iso   mortality_rate_unint…¹
   <chr>                     <chr>     <chr>  <chr> <chr>                  <dbl>
 1 Indonesia                 Asia      Male   All … IDN                      0.5
 2 Indonesia                 Asia      Male   Rural IDN                      0.5
 3 Indonesia                 Asia      Male   Urban IDN                      0.5
 4 Malaysia                  Asia      Male   miss… MYS                      1  
 5 Chile                     South Am… Male   miss… CHL                      0.5
 6 Bolivia (Plurinational S… South Am… Male   All … BOL                      0.6
 7 Bolivia (Plurinational S… South Am… Male   Rural BOL                      0.6
 8 Bolivia (Plurinational S… South Am… Male   Urban BOL                      0.6
 9 Peru                      South Am… Male   Rural PER                      0.5
10 Argentina                 South Am… Male   miss… ARG                      0.5
# ℹ 400 more rows
# ℹ abbreviated name: ¹​mortality_rate_unintentional_poisoning_2019
# ℹ 2 more variables: pop_with_basic_handwashing_facilites_2019 <dbl>,
#   geometry <MULTIPOLYGON [°]>
countries_sf_female
Simple feature collection with 410 features and 7 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: -179.9999 ymin: -55.62754 xmax: 179.9999 ymax: 83.11652
Geodetic CRS:  WGS 84
# A tibble: 410 × 8
   country                   continent gender area  iso   mortality_rate_unint…¹
   <chr>                     <chr>     <chr>  <chr> <chr>                  <dbl>
 1 Indonesia                 Asia      Female All … IDN                      0.1
 2 Indonesia                 Asia      Female Rural IDN                      0.1
 3 Indonesia                 Asia      Female Urban IDN                      0.1
 4 Malaysia                  Asia      Female miss… MYS                      0.3
 5 Chile                     South Am… Female miss… CHL                      0.2
 6 Bolivia (Plurinational S… South Am… Female All … BOL                      0.5
 7 Bolivia (Plurinational S… South Am… Female Rural BOL                      0.5
 8 Bolivia (Plurinational S… South Am… Female Urban BOL                      0.5
 9 Peru                      South Am… Female Rural PER                      0.3
10 Argentina                 South Am… Female miss… ARG                      0.3
# ℹ 400 more rows
# ℹ abbreviated name: ¹​mortality_rate_unintentional_poisoning_2019
# ℹ 2 more variables: pop_with_basic_handwashing_facilites_2019 <dbl>,
#   geometry <MULTIPOLYGON [°]>
countries_sf_both
Simple feature collection with 410 features and 7 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: -179.9999 ymin: -55.62754 xmax: 179.9999 ymax: 83.11652
Geodetic CRS:  WGS 84
# A tibble: 410 × 8
   country                   continent gender area  iso   mortality_rate_unint…¹
   <chr>                     <chr>     <chr>  <chr> <chr>                  <dbl>
 1 Indonesia                 Asia      Both … All … IDN                      0.3
 2 Indonesia                 Asia      Both … Rural IDN                      0.3
 3 Indonesia                 Asia      Both … Urban IDN                      0.3
 4 Malaysia                  Asia      Both … miss… MYS                      0.7
 5 Chile                     South Am… Both … miss… CHL                      0.4
 6 Bolivia (Plurinational S… South Am… Both … All … BOL                      0.6
 7 Bolivia (Plurinational S… South Am… Both … Rural BOL                      0.6
 8 Bolivia (Plurinational S… South Am… Both … Urban BOL                      0.6
 9 Peru                      South Am… Both … Rural PER                      0.4
10 Argentina                 South Am… Both … miss… ARG                      0.4
# ℹ 400 more rows
# ℹ abbreviated name: ¹​mortality_rate_unintentional_poisoning_2019
# ℹ 2 more variables: pop_with_basic_handwashing_facilites_2019 <dbl>,
#   geometry <MULTIPOLYGON [°]>
# Get a color palette with three colors
colors <- brewer.pal(3, "Set1")

# Assign colors to the categories
colorpal_gender <- colorFactor(colors, 
                               levels = c("Female", "Male", "Both sexes"))

3.14 Map of mortality rate unintentional poisoning for male gender

# Replace NAs in 'gender' column
countries_sf_male$gender <- replace_na(countries_sf_male$gender, "missing")

leaflet(countries_sf_male) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addPolygons(fillColor = ~colorpal_gender(gender),  
              fillOpacity = 0.8,
              weight = 2,
              opacity = 1,
              color = "white",
              dashArray = "3",
              highlight = highlightOptions(
                weight = 5,
                color = "#666",
                dashArray = "",
                fillOpacity = 0.7,
                bringToFront = TRUE),
              label = ~iso,  
              popup = ~paste("Country Code: ", iso,
                             "<br>Country: ", country,
                             "<br>Mortality Rate Unintentional Poisoning per 100,000: ", mortality_rate_unintentional_poisoning_2019,
                             "<br>Gender: ", gender)) %>%
  addLegend(pal = colorpal_gender, 
            values = ~gender, 
            title = "Gender", 
            position = "bottomright")

3.15 Map of mortality rate unintentional poisoning for female gender

# Replace NAs in 'gender' column
countries_sf_female$gender <- replace_na(countries_sf_female$gender, "missing")

leaflet(countries_sf_female) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addPolygons(fillColor = ~colorpal_gender(gender),  
              fillOpacity = 0.8,
              weight = 2,
              opacity = 1,
              color = "white",
              dashArray = "3",
              highlight = highlightOptions(
                weight = 5,
                color = "#666",
                dashArray = "",
                fillOpacity = 0.7,
                bringToFront = TRUE),
              label = ~iso,  
              popup = ~paste("Country Code: ", iso,
                             "<br>Country: ", country,
                             "<br>Mortality Rate Unintentional Poisoning per 100,000: ", mortality_rate_unintentional_poisoning_2019,
                             "<br>Gender: ", gender)) %>%
  addLegend(pal = colorpal_gender, 
            values = ~gender, 
            title = "Gender", 
            position = "bottomright")

3.16 Map of mortality rate unintentional poisoning for both gender

# Replace NAs in 'gender' column
countries_sf_both$gender <- replace_na(countries_sf_both$gender, "missing")

leaflet(countries_sf_both) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addPolygons(fillColor = ~colorpal_gender(gender),  # Assign color based on gender
              fillOpacity = 0.8,
              weight = 2,
              opacity = 1,
              color = "white",
              dashArray = "3",
              highlight = highlightOptions(
                weight = 5,
                color = "#666",
                dashArray = "",
                fillOpacity = 0.7,
                bringToFront = TRUE),
              label = ~iso,  
              popup = ~paste("Country Code: ", iso,
                             "<br>Country: ", country,
                             "<br>Mortality Rate Unintentional Poisoning per 100,000: ", mortality_rate_unintentional_poisoning_2019,
                             "<br>Gender: ", gender)) %>%
  addLegend(pal = colorpal_gender, 
            values = ~gender, 
            title = "Gender", 
            position = "bottomright")

4 Plots of Indicator between mortality rate and population with basic handwashing facilities

countries_sf <- countries_sf[!is.na(countries_sf$mortality_rate_unintentional_poisoning_2019) & !is.na(countries_sf$pop_with_basic_handwashing_facilites_2019), ]

countries_sf <- countries_sf %>%
  filter(gender == 'Both sexes')

countries_sf
Simple feature collection with 313 features and 7 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: -172.7497 ymin: -34.80869 xmax: 166.1479 ymax: 55.38515
Geodetic CRS:  WGS 84
# A tibble: 313 × 8
   country                   continent gender area  iso   mortality_rate_unint…¹
 * <chr>                     <chr>     <chr>  <chr> <chr>                  <dbl>
 1 Indonesia                 Asia      Both … All … IDN                      0.3
 2 Indonesia                 Asia      Both … Rural IDN                      0.3
 3 Indonesia                 Asia      Both … Urban IDN                      0.3
 4 Bolivia (Plurinational S… South Am… Both … All … BOL                      0.6
 5 Bolivia (Plurinational S… South Am… Both … Rural BOL                      0.6
 6 Bolivia (Plurinational S… South Am… Both … Urban BOL                      0.6
 7 Peru                      South Am… Both … Rural PER                      0.4
 8 India                     Asia      Both … Urban IND                      0.3
 9 India                     Asia      Both … All … IND                      0.3
10 India                     Asia      Both … Rural IND                      0.3
# ℹ 303 more rows
# ℹ abbreviated name: ¹​mortality_rate_unintentional_poisoning_2019
# ℹ 2 more variables: pop_with_basic_handwashing_facilites_2019 <dbl>,
#   geometry <MULTIPOLYGON [°]>

4.1 Correlation

# Plot the data
plot_loess <- ggplot(countries_sf, aes(x = mortality_rate_unintentional_poisoning_2019, y = pop_with_basic_handwashing_facilites_2019)) +
  
  geom_point(aes(color = area, 
                 shape = area,
                 text = paste("Country:", country, "<br>", "Location:", area)), alpha = 0.7) +
  
  geom_smooth(data = subset(countries_sf, area == "All areas"),
              method = loess, se = FALSE,
              aes(color = area, weight = pop_with_basic_handwashing_facilites_2019),
              show.legend = TRUE) +
  
  geom_smooth(data = subset(countries_sf, area == "Urban"),
              method = loess, se = FALSE,
              aes(color = area, weight = pop_with_basic_handwashing_facilites_2019),
              show.legend = TRUE) +

  geom_smooth(data = subset(countries_sf, area == "Rural"),
              method = loess, se = FALSE,
              aes(color = area, weight = pop_with_basic_handwashing_facilites_2019),
              show.legend = TRUE) +
  
  labs(x = "Mortality Rate (deaths per 100,000 population) for 2019 ",
       y = "Population with Basic Handwashing Facilities for 2019 (%)",
       title = "Correlation between Mortality Rate and Basic Handwashing Facilities \n for Both Gender",
       color = "Location Type",
       shape = "Location Type") +
  
  theme_minimal() +
  
  guides(color = guide_legend(override.aes = list(shape = 16)),
         shape = guide_legend(override.aes = list(color = "black")))
Warning in geom_point(aes(color = area, shape = area, text = paste("Country:",
: Ignoring unknown aesthetics: text
# Convert to a plotly plot
gp_map_1 <- ggplotly(plot_loess, tooltip = "text")
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
gp_map_1
correlation_all <- cor(countries_sf$pop_with_basic_handwashing_facilites_2019[countries_sf$area == "All areas"], 
                       countries_sf$mortality_rate_unintentional_poisoning_2019[countries_sf$area == "All areas"])

correlation_urban <- cor(countries_sf$pop_with_basic_handwashing_facilites_2019[countries_sf$area == "Urban"], 
                         countries_sf$mortality_rate_unintentional_poisoning_2019[countries_sf$area == "Urban"])

correlation_rural <- cor(countries_sf$pop_with_basic_handwashing_facilites_2019[countries_sf$area == "Rural"], 
                         countries_sf$mortality_rate_unintentional_poisoning_2019[countries_sf$area == "Rural"])

blank_plot <- ggplot() +
  theme_void() +
  theme(plot.margin = margin(1, 1, 1, 1, "cm")) +
  annotate("text",
           x = 0.5,
           y = 0.5,
           label = paste("Correlation All:",
                         round(correlation_all, 2), "\n",
                         "Correlation Urban:",
                         round(correlation_urban, 2), "\n",
                         "Correlation Rural:",
                         round(correlation_rural, 2)),
           size = 5,
           color = "black",
           hjust = 0.5)

# Convert to a plotly plot
gp_text_map_1 <- ggplotly(blank_plot)

gp_text_map_1
stacked_bar_chart <- ggplot(countries_sf, aes(x = mortality_rate_unintentional_poisoning_2019, fill = area)) +
  geom_bar(position = "stack") +
  labs(x = "Mortality Rate (deaths per 100,000 population)",
       y = "Count",
       title = "Stacked Bar Chart of Mortality Rates") +
  theme_minimal()

# Convert to a plotly plot
ggplot_stacked_bar_chart <- ggplotly(stacked_bar_chart)

# Create a grouped bar chart
grouped_bar_chart <- ggplot(countries_sf, aes(x = mortality_rate_unintentional_poisoning_2019, fill = area)) +
  geom_bar(position = "dodge") +
  labs(x = "Mortality Rate (deaths per 100,000 population)",
       y = "Count",
       title = "Grouped Bar Chart of Mortality Rates") +
  theme_minimal()

# Convert to a plotly plot
ggplot_grouped_bar_chart <- ggplotly(grouped_bar_chart)

4.2 Barchart and stacked barchart (without Transformation)

ggplot_stacked_bar_chart <- ggplotly(stacked_bar_chart, width = 1000, height = 600, autosize = TRUE)
ggplot_grouped_bar_chart <- ggplotly(grouped_bar_chart, width = 1000, height = 600, autosize = TRUE)

ggplot_stacked_bar_chart
ggplot_grouped_bar_chart

4.3 Normalise Data

#Ensure you have the MASS library installed

countries_sf$mortality_rate_unintentional_poisoning_2019.positive <- countries_sf$mortality_rate_unintentional_poisoning_2019 + abs(min(countries_sf$mortality_rate_unintentional_poisoning_2019)) + 0.1
# Apply the Box-Cox transformation
bc_result <- MASS::boxcox(countries_sf$mortality_rate_unintentional_poisoning_2019.positive ~ 1, 
                    lambda = seq(-3,3,0.1))

# The optimal lambda value is the one that maximizes the log-likelihood
optimal_lambda <- bc_result$x[which.max(bc_result$y)]

# Transform the data using the optimal lambda value
if (optimal_lambda == 0) {
  countries_sf$latest_value.x_bc <- log(countries_sf$mortality_rate_unintentional_poisoning_2019.positive)
} else {
  countries_sf$latest_value.x_bc <- (countries_sf$mortality_rate_unintentional_poisoning_2019.positive^optimal_lambda - 1) / optimal_lambda
}

# Shift the transformed variable to be non-negative
min_value <- min(countries_sf$latest_value.x_bc)
countries_sf$latest_value.x_bc <- countries_sf$latest_value.x_bc + abs(min_value) + 0.1

4.4 Histogram (Normalised)

# Round the transformed mortality rates to the nearest whole number
countries_sf$latest_value.x_bc_rounded <- round(countries_sf$latest_value.x_bc)

# Convert the latest_value.x_bc_rounded variable into a categorical variable
countries_sf$latest_value.x_bc_cat <- cut(countries_sf$latest_value.x_bc_rounded, breaks = 50)

# Create a stacked bar chart
stacked_bar_chart_bc <- ggplot(countries_sf, aes(x = latest_value.x_bc_rounded, fill = area)) +
  geom_bar(position = "stack", bins =20) +
  labs(x = "Transformed Mortality Rate",
       y = "Count",
       title = "Stacked Bar Chart of Transformed Mortality Rates") +
  theme_minimal()
Warning in geom_bar(position = "stack", bins = 20): Ignoring unknown
parameters: `bins`
ggplot_stacked_bar_chart_bc <- ggplotly(stacked_bar_chart_bc)
ggplot_stacked_bar_chart_bc
# Create a grouped bar chart
grouped_bar_chart_bc <- ggplot(countries_sf, aes(x = latest_value.x_bc_rounded, fill = area)) +
  geom_bar(position = "dodge") +
  labs(x = "Transformed Mortality Rate",
       y = "Count",
       title = "Grouped Bar Chart of Transformed Mortality Rates") +
  theme_minimal()

# Convert to a plotly plot
ggplot_grouped_bar_chart_bc <- ggplotly(grouped_bar_chart_bc)
ggplot_grouped_bar_chart_bc

4.5 Scatter plot Normalised

plot_loess <- ggplot(countries_sf, aes(x = latest_value.x_bc, y = pop_with_basic_handwashing_facilites_2019)) +
  
  geom_point(aes(color = area, 
                 shape = area,
                 text = paste("Country:", country, "<br>", "Location:", area)), alpha = 0.7) +
  
  geom_smooth(data = subset(countries_sf, area == "All areas"),
              method = loess, se = FALSE, 
              aes(color = area, weight = pop_with_basic_handwashing_facilites_2019),
              show.legend = TRUE) +
  
  geom_smooth(data = subset(countries_sf, area == "Urban"),
              method = loess,
              se = FALSE,
              aes(color = area, weight = pop_with_basic_handwashing_facilites_2019),
              show.legend = TRUE) +

  geom_smooth(data = subset(countries_sf, area == "Rural"),
              method = loess,
              se = FALSE,
              aes(color = area, weight = pop_with_basic_handwashing_facilites_2019),
              show.legend = TRUE) +
  
  labs(x = "Transformed Mortality Rate (deaths per 100,000 population) for 2019",
       y = "Population with Basic Handwashing Facilities for 2019 (%)",
       title = "Correlation between Mortality Rate and Basic Handwashing Facilities (Normalised) for Both Gender",
       color = "Location Type",
       shape = "Location Type") +
  
  theme_minimal() +
  
  guides(color = guide_legend(override.aes = list(shape = 16)),
         shape = guide_legend(override.aes = list(color = "black")))
Warning in geom_point(aes(color = area, shape = area, text = paste("Country:",
: Ignoring unknown aesthetics: text
gp_normalised_map_1 <- ggplotly(plot_loess, tooltip = "text")
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
gp_normalised_map_1

4.6 Correlation Normalised

correlation_all <- cor(countries_sf$latest_value.x_bc[countries_sf$area == "All areas"], 
                       countries_sf$pop_with_basic_handwashing_facilites_2019[countries_sf$area == "All areas"])

correlation_urban <- cor(countries_sf$latest_value.x_bc[countries_sf$area == "Urban"], 
                         countries_sf$pop_with_basic_handwashing_facilites_2019[countries_sf$area == "Urban"])

correlation_rural <- cor(countries_sf$latest_value.x_bc[countries_sf$area == "Rural"], 
                         countries_sf$pop_with_basic_handwashing_facilites_2019[countries_sf$area == "Rural"])

blank_plot <- ggplot() +
  theme_void() +
  theme(plot.margin = margin(1, 1, 1, 1, "cm")) +
  annotate("text",
           x = 0.5,
           y = 0.5,
           label = paste("Correlation All:",
                         round(correlation_all, 2), "\n",
                         "Correlation Urban:",
                         round(correlation_urban, 2), "\n",
                         "Correlation Rural:",
                         round(correlation_rural, 2)),
           size = 5,
           color = "black",
           hjust = 0.5)

gp_text_normalised_map_1 <- ggplotly(blank_plot)
gp_text_normalised_map_1

4.7 Combining normal and unormalised plots

combined_plot_map_1 <- subplot(
  gp_map_1, gp_text_map_1, 
  gp_normalised_map_1, gp_text_normalised_map_1, 
  nrows = 2, margin = 0.05
)

# Print the combined plot
combined_plot_map_1

5 Map of Indicator 2

6 Retrieve Data

6.1 Mortality Rate due to Unsafe Water

mortality_rate_unsafe_water_map <- dplyr::select(
  mortality_rate_unsafe_water,
  code = 'geoAreaCode',
  country = 'geoAreaName',
  region = 'parentName',
  iso = 'ISO3',
  mortality_rate_unsafe_water = 'latest_value'
)

mortality_rate_unsafe_water_map <- mortality_rate_unsafe_water_map%>%
  distinct()

mortality_rate_unsafe_water_map
# A tibble: 183 × 5
    code country                          region    iso   mortality_rate_unsaf…¹
   <dbl> <chr>                            <chr>     <chr>                  <dbl>
 1   583 Micronesia (Federated States of) Oceania … FSM                      3.6
 2   496 Mongolia                         Eastern … MNG                      1.3
 3   499 Montenegro                       Southern… MNE                      0  
 4   504 Morocco                          Northern… MAR                      1.9
 5   508 Mozambique                       Eastern … MOZ                     27.6
 6   104 Myanmar                          South-Ea… MMR                     12.6
 7   516 Namibia                          Southern… NAM                     18.3
 8   524 Nepal                            Southern… NPL                     19.8
 9   528 Netherlands                      Western … NLD                      0.2
10   554 New Zealand                      Australi… NZL                      0.1
# ℹ 173 more rows
# ℹ abbreviated name: ¹​mortality_rate_unsafe_water

6.2 Proportion of safe water management

proportion_of_safe_water_map <- dplyr::select(
  proportion_of_safe_water,
  code = 'geoAreaCode',
  country = 'geoAreaName',
  region = 'parentName',
  iso = 'ISO3',
  urbanisation = "location_desc",
  value_2019 = "value_2019"
)

proportion_of_safe_water_map <- proportion_of_safe_water_map%>%
  distinct()

proportion_of_safe_water_map
# A tibble: 286 × 6
    code country                            region iso   urbanisation value_2019
   <dbl> <chr>                              <chr>  <chr> <chr>             <dbl>
 1   807 North Macedonia                    South… MKD   Rural                66
 2   804 Ukraine                            Easte… UKR   Urban                89
 3   807 North Macedonia                    South… MKD   Urban                85
 4   826 United Kingdom of Great Britain a… North… GBR   All areas           100
 5   580 Northern Mariana Islands           Ocean… MNP   All areas            91
 6   840 United States of America           North… USA   All areas            97
 7   578 Norway                             North… NOR   All areas            99
 8   512 Oman                               Weste… OMN   All areas            90
 9   586 Pakistan                           South… PAK   All areas            36
10   586 Pakistan                           South… PAK   Rural                33
# ℹ 276 more rows

7 Cleaning and fixing geometry data

countries_geom <- read_sf("./data/WB_countries_Admin0.geojson")
countries_boundary <- st_boundary(countries_geom)
countries_sf_simplified <- ms_simplify(countries_boundary, keep = 0.05)
countries_sf_valid <- st_make_valid(countries_sf_simplified)
countries_wrapped <- st_wrap_dateline(countries_sf_valid)
polygon <- st_polygon(x = list(rbind(c(-180.0001, 90),
                                     c(-179.9999, 90),
                                     c(-179.9999, -90),
                                     c(-180.0001, -90),
                                     c(-180.0001, 90)))) |>
  st_sfc() |>
  st_set_crs(4326)

countries_wrapped <- countries_wrapped |>
  st_difference(polygon)
Warning: attribute variables are assumed to be spatially constant throughout
all geometries
land_sf <- read_sf("data/WB_Land.geojson")
land_boundary <- st_boundary(land_sf)
land_sf_simplified <- ms_simplify(land_boundary, keep = 0.05)
land_sf_valid <- st_make_valid(land_sf_simplified)
land_wrapped <- st_wrap_dateline(land_sf_valid)
polygon <- st_polygon(x = list(rbind(c(-180.0001, 90),
                                     c(-179.9999, 90),
                                     c(-179.9999, -90),
                                     c(-180.0001, -90),
                                     c(-180.0001, 90)))) |>
  st_sfc() |>
  st_set_crs(4326)

land_wrapped <- land_wrapped |>
  st_difference(polygon)
Warning: attribute variables are assumed to be spatially constant throughout
all geometries
countries_wrapped <- countries_wrapped |>
  st_difference(polygon)
Warning: attribute variables are assumed to be spatially constant throughout
all geometries
# Washing data combined
countries_sf_water <- left_join(
    countries_wrapped,
  proportion_of_safe_water_map,
  by = c("WB_A3" = "iso")
)
Warning in sf_column %in% names(g): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 3 of `x` matches multiple rows in `y`.
ℹ Row 6 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
countries_sf_water
Simple feature collection with 325 features and 57 fields
Geometry type: MULTILINESTRING
Dimension:     XY
Bounding box:  xmin: -179.9999 ymin: -55.62754 xmax: 179.9999 ymax: 83.6341
Geodetic CRS:  WGS 84
# A tibble: 325 × 58
   REGION_WB   ISO_N3 ISO_A3_EH NAME_AR ISO_A3 GDP_MD_EST NAME_VI WB_NAME ISO_A2
   <chr>       <chr>  <chr>     <chr>   <chr>       <dbl> <chr>   <chr>   <chr> 
 1 East Asia … 360    IDN       إندوني… IDN       3028000 Indone… Indone… ID    
 2 East Asia … 458    MYS       ماليزيا MYS        863000 Malays… Malays… MY    
 3 Latin Amer… 152    CHL       تشيلي   CHL        436100 Chile   Chile   CL    
 4 Latin Amer… 152    CHL       تشيلي   CHL        436100 Chile   Chile   CL    
 5 Latin Amer… 068    BOL       بوليفيا BOL         78350 Bolivia Bolivia BO    
 6 Latin Amer… 604    PER       بيرو    PER        410400 Peru    Peru    PE    
 7 Latin Amer… 604    PER       بيرو    PER        410400 Peru    Peru    PE    
 8 Latin Amer… 604    PER       بيرو    PER        410400 Peru    Peru    PE    
 9 Latin Amer… 032    ARG       الأرجن… ARG        879400 Argent… Argent… AR    
10 Europe & C… 196    CYP       قبرص    CYP         29260 Cộng h… Cyprus  CY    
# ℹ 315 more rows
# ℹ 49 more variables: UN_A3 <chr>, FIPS_10_ <chr>, NAME_EN <chr>,
#   NAME_EL <chr>, WB_RULES <chr>, OBJECTID <int>, NAME_DE <chr>,
#   SUBREGION <chr>, featurecla <chr>, NAME_ZH <chr>, WB_A3 <chr>,
#   LASTCENSUS <int>, TYPE <chr>, NAME_PT <chr>, NAME_BN <chr>,
#   FORMAL_EN <chr>, POP_YEAR <int>, REGION_UN <chr>, LEVEL <int>,
#   POP_RANK <int>, NAME_IT <chr>, FORMAL_FR <chr>, CONTINENT <chr>, …
# Mortality Rate Combined
countries_sf_mortality <- left_join(
    countries_wrapped,
  mortality_rate_unsafe_water_map,
  by = c("WB_A3" = "iso")
)

countries_sf_mortality
Simple feature collection with 197 features and 56 fields
Geometry type: MULTILINESTRING
Dimension:     XY
Bounding box:  xmin: -179.9999 ymin: -55.62754 xmax: 179.9999 ymax: 83.6341
Geodetic CRS:  WGS 84
# A tibble: 197 × 57
   REGION_WB   ISO_N3 ISO_A3_EH NAME_AR ISO_A3 GDP_MD_EST NAME_VI WB_NAME ISO_A2
   <chr>       <chr>  <chr>     <chr>   <chr>       <dbl> <chr>   <chr>   <chr> 
 1 East Asia … 360    IDN       إندوني… IDN       3028000 Indone… Indone… ID    
 2 East Asia … 458    MYS       ماليزيا MYS        863000 Malays… Malays… MY    
 3 Latin Amer… 152    CHL       تشيلي   CHL        436100 Chile   Chile   CL    
 4 Latin Amer… 068    BOL       بوليفيا BOL         78350 Bolivia Bolivia BO    
 5 Latin Amer… 604    PER       بيرو    PER        410400 Peru    Peru    PE    
 6 Latin Amer… 032    ARG       الأرجن… ARG        879400 Argent… Argent… AR    
 7 Europe & C… 196    CYP       قبرص    CYP         29260 Cộng h… Cyprus  CY    
 8 South Asia  356    IND       الهند   IND       8721000 Ấn Độ   India   IN    
 9 East Asia … 156    CHN       جمهوري… CHN      21140000 Cộng h… China   CN    
10 Middle Eas… 376    ISR       إسرائيل ISR        297000 Israel  Israel  IL    
# ℹ 187 more rows
# ℹ 48 more variables: UN_A3 <chr>, FIPS_10_ <chr>, NAME_EN <chr>,
#   NAME_EL <chr>, WB_RULES <chr>, OBJECTID <int>, NAME_DE <chr>,
#   SUBREGION <chr>, featurecla <chr>, NAME_ZH <chr>, WB_A3 <chr>,
#   LASTCENSUS <int>, TYPE <chr>, NAME_PT <chr>, NAME_BN <chr>,
#   FORMAL_EN <chr>, POP_YEAR <int>, REGION_UN <chr>, LEVEL <int>,
#   POP_RANK <int>, NAME_IT <chr>, FORMAL_FR <chr>, CONTINENT <chr>, …

8 Creating the Maps

8.1 Get the different values for all areas, urban and rural in water management

countries_sf_water_all <- countries_sf_water %>% filter(!(urbanisation %in% c("Rural","Urban")))
countries_sf_water_urban <- countries_sf_water %>% filter(!(urbanisation %in% c("All area","Urban")))
countries_sf_water_rural <- countries_sf_water %>% filter(!(urbanisation %in% c("Urban","All areas")))

8.1.1 all areas for water management

# Load the required libraries
library(leaflet)

# Define the color palette function
colorPalette <- colorRampPalette(c("#FF6969", "#00A0D6"))


# Generate the color palette based on the number of breaks
numBreaks <- 6
colors <- colorPalette(numBreaks)

# Create a Leaflet map
all_map <- leaflet(countries_sf_water_all) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addPolygons(fillColor = ~colorNumeric(palette=colors, domain = c(0, 20, 40, 60, 80, 100))(value_2019),  
              fillOpacity = 0.8,
              weight = 2,
              opacity = 1,
              color = "white",
              dashArray = "3",
              highlight = highlightOptions(
                weight = 5,
                color = "#666",
                dashArray = "",
                fillOpacity = 0.7,
                bringToFront = TRUE),
              label = ~WB_A3,  
              popup = ~paste("Country Code: ", WB_A3,
                             "<br>Country: ", country,
                             "<br>Proportion of population access to safely managed drinking water service: ", value_2019))%>%
  addLegend(
    position = "bottomright",
    pal = colorNumeric(palette = colors, domain = c(0, 100)),
    values = c(0, 20, 40, 60, 80, 100),
    title = "Proportion (%)",
    labels = c("0-20", "20-40", "40-60", "60-80", "80-100"),
    opacity = 1
  )

all_map

8.1.2 Urban areas

# Define the color palette function
colorPalette <- colorRampPalette(c("#FF6969", "#00A0D6"))


# Generate the color palette based on the number of breaks
numBreaks <- 6
colors <- colorPalette(numBreaks)

# Create a Leaflet map
urban_map <- leaflet(countries_sf_water_urban) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addPolygons(fillColor = ~colorNumeric(palette=colors, domain = c(0, 20, 40, 60, 80, 100))(value_2019),  
              fillOpacity = 0.8,
              weight = 2,
              opacity = 1,
              color = "white",
              dashArray = "3",
              highlight = highlightOptions(
                weight = 5,
                color = "#666",
                dashArray = "",
                fillOpacity = 0.7,
                bringToFront = TRUE),
              label = ~WB_A3,
              popup = ~paste("Country Code: ", WB_A3,
                             "<br>Country: ", country,
                             "<br>Proportion of population access to safely managed drinking water service: ", value_2019))%>%
  addLegend(
    position = "bottomright",
    pal = colorNumeric(palette = colors, domain = c(0, 100)),
    values = c(0, 20, 40, 60, 80, 100),
    title = "Proportion (%)",
    labels = c("0-20", "20-40", "40-60", "60-80", "80-100"),
    opacity = 1
  )

urban_map

8.1.3 Rural

# Define the color palette function
colorPalette <- colorRampPalette(c("#FF6969", "#00A0D6"))


# Generate the color palette based on the number of breaks
numBreaks <- 6
colors <- colorPalette(numBreaks)

# Create a Leaflet map
rural_map <- leaflet(countries_sf_water_rural) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addPolygons(fillColor = ~colorNumeric(palette=colors, domain = c(0, 20, 40, 60, 80, 100))(value_2019),  
              fillOpacity = 0.8,
              weight = 2,
              opacity = 1,
              color = "white",
              dashArray = "3",
              highlight = highlightOptions(
                weight = 5,
                color = "#666",
                dashArray = "",
                fillOpacity = 0.7,
                bringToFront = TRUE),
              label = ~WB_A3,  
              popup = ~paste("Country Code: ", WB_A3,
                             "<br>Country: ", country,
                             "<br>Proportion of population access to safely managed drinking water service: ", value_2019))%>%
  addLegend(
    position = "bottomright",
    pal = colorNumeric(palette = colors, domain = c(0, 100)),
    values = c(0, 20, 40, 60, 80, 100),
    title = "Proportion (%)",
    labels = c("0-20", "20-40", "40-60", "60-80", "80-100"),
    opacity = 1
  )

rural_map

8.2 Create map for mortality rate

# Define the color palette function
colorPalette <- colorRampPalette(c("#FFFFDF", "#FFEEAA", "#FFBB55", "#FF7700", "#FF4400"))


# Generate the color palette based on the number of breaks
numBreaks <- 6
colors <- colorPalette(numBreaks)

# Create a Leaflet map
mortality_mapping <- leaflet(countries_sf_mortality) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addPolygons(fillColor = ~colorNumeric(palette=colors, domain = c(0, 20, 40, 60, 80, 100))(mortality_rate_unsafe_water),  
              fillOpacity = 0.8,
              weight = 2,
              opacity = 1,
              color = "grey",
              dashArray = "3",
              highlight = highlightOptions(
                weight = 5,
                color = "#666",
                dashArray = "",
                fillOpacity = 0.7,
                bringToFront = TRUE),
              label = ~WB_A3,  
              popup = ~paste("Country Code: ", WB_A3,
                             "<br>Country: ", country,
                             "<br>Mortality Rate from Unsafe Sanitation and Water per 100,000: ", mortality_rate_unsafe_water))%>%
  addLegend(
    position = "bottomright",
    pal = colorNumeric(palette = colors, domain = c(0, 100)),
    values = c(0, 20, 40, 60, 80, 100),
    title = "Proportion (100, 000)",
    labels = c("0-20", "20-40", "40-60", "60-80", "80-100"),
    opacity = 1
  )
Warning in colorNumeric(palette = colors, domain = c(0, 20, 40, 60, 80, : Some
values were outside the color scale and will be treated as NA
mortality_mapping
# Plots of Indicator between mortality rate due to unsafe drinking water and proportion of population using safely managed drinking water services.
merged_data <- merge(mortality_rate_unsafe_water, proportion_of_safe_water, by=c('geoAreaCode', 'geoAreaName'))
merged_data <- merged_data[!is.na(merged_data$value_2019), ] #remove NA values for 2019
# Display the first few rows of the merged dataframe
head(merged_data)
  geoAreaCode geoAreaName ObjectId.x goal_code.x goal_labelEN.x
1         100    Bulgaria        113           3         Goal 3
2         104     Myanmar          7           3         Goal 3
3         104     Myanmar          7           3         Goal 3
4         104     Myanmar          7           3         Goal 3
5         112     Belarus        102           3         Goal 3
6         116    Cambodia        117           3         Goal 3
                                                    goal_descEN.x target_code.x
1 Ensure healthy lives and promote well-being for all at all ages           3.9
2 Ensure healthy lives and promote well-being for all at all ages           3.9
3 Ensure healthy lives and promote well-being for all at all ages           3.9
4 Ensure healthy lives and promote well-being for all at all ages           3.9
5 Ensure healthy lives and promote well-being for all at all ages           3.9
6 Ensure healthy lives and promote well-being for all at all ages           3.9
                                                                                                                                target_descEN.x
1 By 2030, substantially reduce the number of deaths and illnesses from hazardous chemicals and air, water and soil pollution and contamination
2 By 2030, substantially reduce the number of deaths and illnesses from hazardous chemicals and air, water and soil pollution and contamination
3 By 2030, substantially reduce the number of deaths and illnesses from hazardous chemicals and air, water and soil pollution and contamination
4 By 2030, substantially reduce the number of deaths and illnesses from hazardous chemicals and air, water and soil pollution and contamination
5 By 2030, substantially reduce the number of deaths and illnesses from hazardous chemicals and air, water and soil pollution and contamination
6 By 2030, substantially reduce the number of deaths and illnesses from hazardous chemicals and air, water and soil pollution and contamination
  indicator_code.x indicator_reference.x
1          C030902                 3.9.2
2          C030902                 3.9.2
3          C030902                 3.9.2
4          C030902                 3.9.2
5          C030902                 3.9.2
6          C030902                 3.9.2
                                                                                                                                           indicator_descEN.x
1 Mortality rate attributed to unsafe water, unsafe sanitation and lack of hygiene (exposure to unsafe Water, Sanitation and Hygiene for All (WASH) services)
2 Mortality rate attributed to unsafe water, unsafe sanitation and lack of hygiene (exposure to unsafe Water, Sanitation and Hygiene for All (WASH) services)
3 Mortality rate attributed to unsafe water, unsafe sanitation and lack of hygiene (exposure to unsafe Water, Sanitation and Hygiene for All (WASH) services)
4 Mortality rate attributed to unsafe water, unsafe sanitation and lack of hygiene (exposure to unsafe Water, Sanitation and Hygiene for All (WASH) services)
5 Mortality rate attributed to unsafe water, unsafe sanitation and lack of hygiene (exposure to unsafe Water, Sanitation and Hygiene for All (WASH) services)
6 Mortality rate attributed to unsafe water, unsafe sanitation and lack of hygiene (exposure to unsafe Water, Sanitation and Hygiene for All (WASH) services)
  series_release.x
1     2021.Q2.G.03
2     2021.Q2.G.03
3     2021.Q2.G.03
4     2021.Q2.G.03
5     2021.Q2.G.03
6     2021.Q2.G.03
                                                                                                   series_tags.x
1 ['hygiene', 'health', 'mortality', 'death', 'water-related diseases', 'sanitation', 'water', 'drinking water']
2 ['hygiene', 'health', 'mortality', 'death', 'water-related diseases', 'sanitation', 'water', 'drinking water']
3 ['hygiene', 'health', 'mortality', 'death', 'water-related diseases', 'sanitation', 'water', 'drinking water']
4 ['hygiene', 'health', 'mortality', 'death', 'water-related diseases', 'sanitation', 'water', 'drinking water']
5 ['hygiene', 'health', 'mortality', 'death', 'water-related diseases', 'sanitation', 'water', 'drinking water']
6 ['hygiene', 'health', 'mortality', 'death', 'water-related diseases', 'sanitation', 'water', 'drinking water']
     series.x
1 SH_STA_WASH
2 SH_STA_WASH
3 SH_STA_WASH
4 SH_STA_WASH
5 SH_STA_WASH
6 SH_STA_WASH
                                                                                               seriesDescription.x
1 Mortality rate attributed to unsafe water, unsafe sanitation and lack of hygiene (deaths per 100,000 population)
2 Mortality rate attributed to unsafe water, unsafe sanitation and lack of hygiene (deaths per 100,000 population)
3 Mortality rate attributed to unsafe water, unsafe sanitation and lack of hygiene (deaths per 100,000 population)
4 Mortality rate attributed to unsafe water, unsafe sanitation and lack of hygiene (deaths per 100,000 population)
5 Mortality rate attributed to unsafe water, unsafe sanitation and lack of hygiene (deaths per 100,000 population)
6 Mortality rate attributed to unsafe water, unsafe sanitation and lack of hygiene (deaths per 100,000 population)
  level_.x parentCode.x       parentName.x  type.x       X.x      Y.x ISO3.x
1        5          151     Eastern Europe Country  25.23763 42.75731    BGR
2        4           35 South-Eastern Asia Country  96.51752 21.19333    MMR
3        4           35 South-Eastern Asia Country  96.51752 21.19333    MMR
4        4           35 South-Eastern Asia Country  96.51752 21.19333    MMR
5        5          151     Eastern Europe Country  28.04940 53.54193    BLR
6        4           35 South-Eastern Asia Country 104.92284 12.71164    KHM
  UN_Member.x Country_Profile.x UnitMultiplier.x   Units_code.x
1           1                 1               NA PER_100000_POP
2           1                 1               NA PER_100000_POP
3           1                 1               NA PER_100000_POP
4           1                 1               NA PER_100000_POP
5           1                 1               NA PER_100000_POP
6           1                 1               NA PER_100000_POP
            Units_desc.x timeSeries_id.x timeSeries_keys.x n_years.x min_year.x
1 Per 100,000 population     SH_STA_WASH                NA         1       2016
2 Per 100,000 population     SH_STA_WASH                NA         1       2016
3 Per 100,000 population     SH_STA_WASH                NA         1       2016
4 Per 100,000 population     SH_STA_WASH                NA         1       2016
5 Per 100,000 population     SH_STA_WASH                NA         1       2016
6 Per 100,000 population     SH_STA_WASH                NA         1       2016
  max_year.x years.x value_2016.x latest_value.x footnotes.x          nature.x
1       2016  [2016]          0.1            0.1          NA E: Estimated data
2       2016  [2016]         12.6           12.6          NA E: Estimated data
3       2016  [2016]         12.6           12.6          NA E: Estimated data
4       2016  [2016]         12.6           12.6          NA E: Estimated data
5       2016  [2016]          0.1            0.1          NA E: Estimated data
6       2016  [2016]          6.5            6.5          NA E: Estimated data
  ObjectId.y goal_code.y goal_labelEN.y
1        263           6         Goal 6
2        204           6         Goal 6
3        203           6         Goal 6
4        202           6         Goal 6
5        250           6         Goal 6
6        264           6         Goal 6
                                                                   goal_descEN.y
1 Ensure availability and sustainable management of water and sanitation for all
2 Ensure availability and sustainable management of water and sanitation for all
3 Ensure availability and sustainable management of water and sanitation for all
4 Ensure availability and sustainable management of water and sanitation for all
5 Ensure availability and sustainable management of water and sanitation for all
6 Ensure availability and sustainable management of water and sanitation for all
  target_code.y
1           6.1
2           6.1
3           6.1
4           6.1
5           6.1
6           6.1
                                                                                target_descEN.y
1 By 2030, achieve universal and equitable access to safe and affordable drinking water for all
2 By 2030, achieve universal and equitable access to safe and affordable drinking water for all
3 By 2030, achieve universal and equitable access to safe and affordable drinking water for all
4 By 2030, achieve universal and equitable access to safe and affordable drinking water for all
5 By 2030, achieve universal and equitable access to safe and affordable drinking water for all
6 By 2030, achieve universal and equitable access to safe and affordable drinking water for all
  indicator_code.y indicator_reference.y
1          C060101                 6.1.1
2          C060101                 6.1.1
3          C060101                 6.1.1
4          C060101                 6.1.1
5          C060101                 6.1.1
6          C060101                 6.1.1
                                                     indicator_descEN.y
1 Proportion of population using safely managed drinking water services
2 Proportion of population using safely managed drinking water services
3 Proportion of population using safely managed drinking water services
4 Proportion of population using safely managed drinking water services
5 Proportion of population using safely managed drinking water services
6 Proportion of population using safely managed drinking water services
  series_release.y               series_tags.y    series.y
1     2021.Q2.G.03 ['water', 'drinking water'] SH_H2O_SAFE
2     2021.Q2.G.03 ['water', 'drinking water'] SH_H2O_SAFE
3     2021.Q2.G.03 ['water', 'drinking water'] SH_H2O_SAFE
4     2021.Q2.G.03 ['water', 'drinking water'] SH_H2O_SAFE
5     2021.Q2.G.03 ['water', 'drinking water'] SH_H2O_SAFE
6     2021.Q2.G.03 ['water', 'drinking water'] SH_H2O_SAFE
                                                                        seriesDescription.y
1 Proportion of population using safely managed drinking water services, by urban/rural (%)
2 Proportion of population using safely managed drinking water services, by urban/rural (%)
3 Proportion of population using safely managed drinking water services, by urban/rural (%)
4 Proportion of population using safely managed drinking water services, by urban/rural (%)
5 Proportion of population using safely managed drinking water services, by urban/rural (%)
6 Proportion of population using safely managed drinking water services, by urban/rural (%)
  level_.y parentCode.y       parentName.y  type.y       X.y      Y.y ISO3.y
1        5          151     Eastern Europe Country  25.23763 42.75731    BGR
2        4           35 South-Eastern Asia Country  96.51752 21.19333    MMR
3        4           35 South-Eastern Asia Country  96.51752 21.19333    MMR
4        4           35 South-Eastern Asia Country  96.51752 21.19333    MMR
5        5          151     Eastern Europe Country  28.04940 53.54193    BLR
6        4           35 South-Eastern Asia Country 104.92284 12.71164    KHM
  UN_Member.y Country_Profile.y location_code location_desc UnitMultiplier.y
1           1                 1            _T     All areas               NA
2           1                 1             U         Urban               NA
3           1                 1             R         Rural               NA
4           1                 1            _T     All areas               NA
5           1                 1            _T     All areas               NA
6           1                 1            _T     All areas               NA
  Units_code.y Units_desc.y timeSeries_id.y timeSeries_keys.y n_years.y
1           PT   Percentage SH_H2O_SAFE___T          location        21
2           PT   Percentage  SH_H2O_SAFE__U          location        21
3           PT   Percentage  SH_H2O_SAFE__R          location        21
4           PT   Percentage SH_H2O_SAFE___T          location        21
5           PT   Percentage SH_H2O_SAFE___T          location        21
6           PT   Percentage SH_H2O_SAFE___T          location        21
  min_year.y max_year.y
1       2000       2020
2       2000       2020
3       2000       2020
4       2000       2020
5       2000       2020
6       2000       2020
                                                                                                                         years.y
1 [2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]
2 [2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]
3 [2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]
4 [2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]
5 [2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]
6 [2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]
  value_2000 value_2001 value_2002 value_2003 value_2004 value_2005 value_2006
1         90         90         91         91         92         92         93
2         51         52         54         56         58         59         61
3         18         20         21         23         24         26         27
4         27         29         30         32         33         35         37
5         81         81         81         81         83         84         85
6         17         17         18         18         19         19         20
  value_2007 value_2008 value_2009 value_2010 value_2011 value_2012 value_2013
1         93         94         94         95         95         96         96
2         63         65         67         68         70         70         71
3         29         30         32         34         35         37         39
4         38         40         42         44         45         47         48
5         86         88         89         90         91         92         93
6         20         21         21         22         23         23         24
  value_2014 value_2015 value_2016.y value_2017 value_2018 value_2019
1         97         97           97         97         97         98
2         71         71           72         72         73         73
3         41         43           45         47         49         51
4         50         51           53         54         56         58
5         94         94           95         95         95         95
6         24         25           25         26         27         27
  value_2020 latest_value.y footnotes.y          nature.y
1         98             98          NA E: Estimated data
2         74             74          NA E: Estimated data
3         52             52          NA E: Estimated data
4         59             59          NA E: Estimated data
5         95             95          NA E: Estimated data
6         28             28          NA E: Estimated data

8.3 Correlation

# Plot the data
plot_loess <- ggplot(merged_data, aes(x = latest_value.x, y = value_2019)) +
  
  geom_point(aes(color = location_desc, 
                 shape = location_desc,
                 text = paste("Country:", geoAreaName, "<br>", "Location:", location_desc)), alpha = 0.7) +
  
  geom_smooth(data = subset(merged_data, location_desc == "All areas"),
              method = loess, se = FALSE, 
              aes(color = location_desc, weight = value_2019),
              show.legend = TRUE) +
  
  geom_smooth(data = subset(merged_data, location_desc == "Urban"),
              method = loess,
              se = FALSE,
              aes(color = location_desc, weight = value_2019),
              show.legend = TRUE) +

  geom_smooth(data = subset(merged_data, location_desc == "Rural"),
              method = loess,
              se = FALSE,
              aes(color = location_desc, weight = value_2019),
              show.legend = TRUE) +
  
  labs(x = "Mortality Rate (deaths per 100,000 population)",
       y = "Safely Managed Drinking Water Services (%)",
       title = "Correlation between Mortality Rate and Water Services (2019)",
       color = "Location Type",
       shape = "Location Type") +
  
  theme_minimal() +
  
  guides(color = guide_legend(override.aes = list(shape = 16)),
         shape = guide_legend(override.aes = list(color = "black")))
Warning in geom_point(aes(color = location_desc, shape = location_desc, :
Ignoring unknown aesthetics: text
# Convert to a plotly plot
gp <- ggplotly(plot_loess, tooltip = "text")
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
# Print the plot
gp
correlation_all <- cor(merged_data$latest_value.x[merged_data$location_desc == "All areas"], 
                       merged_data$value_2019[merged_data$location_desc == "All areas"])

correlation_urban <- cor(merged_data$latest_value.x[merged_data$location_desc == "Urban"], 
                         merged_data$value_2019[merged_data$location_desc == "Urban"])

correlation_rural <- cor(merged_data$latest_value.x[merged_data$location_desc == "Rural"], 
                         merged_data$value_2019[merged_data$location_desc == "Rural"])

blank_plot <- ggplot() +
  theme_void() +
  theme(plot.margin = margin(1, 1, 1, 1, "cm")) +
  annotate("text",
           x = 0.5,
           y = 0.5,
           label = paste("Correlation All:",
                         round(correlation_all, 2), "\n",
                         "Correlation Urban:",
                         round(correlation_urban, 2), "\n",
                         "Correlation Rural:",
                         round(correlation_rural, 2)),
           size = 5,
           color = "black",
           hjust = 0.5)

# Convert to a plotly plot
gp_text <- ggplotly(blank_plot)
gp_text

8.4 Barchart and stacked barchart (without Transformation)

stacked_bar_chart <- ggplot(merged_data, aes(x = latest_value.x, fill = location_desc)) +
  geom_histogram(position = "stack", bins = 20) +
  labs(x = "Mortality Rate (deaths per 100,000 population)",
       y = "Count",
       title = "Stacked Bar Chart of Mortality Rates") +
  theme_minimal()

#change to plotly
ggplot_stacked_bar_chart <- ggplotly(stacked_bar_chart)
ggplot_stacked_bar_chart
grouped_bar_chart <- ggplot(merged_data, aes(x = latest_value.x, fill = location_desc)) +
  geom_histogram(position = "dodge", bins = 20) +
  labs(x = "Mortality Rate (deaths per 100,000 population)",
       y = "Count",
       title = "Grouped Bar Chart of Mortality Rates") +
  theme_minimal()

# Convert to a plotly plot
ggplot_grouped_bar_chart <- ggplotly(grouped_bar_chart)
ggplot_grouped_bar_chart
# Filter data for the histogram
merged_data_all<- subset(merged_data, location_desc == "All areas")  # change to rural or urban if you want

histogram <- ggplot(merged_data_all, aes(x = latest_value.x, 
                             text = paste("Country:", geoAreaName, 
                                          "<br>Mortality Rate:", latest_value.x))) +
  geom_histogram(bins = 50, fill = 'grey', color = 'black') +
  labs(x = 'Mortality Rate (deaths per 100,000 population)',
       y = 'Count',
       title = 'Histogram of Mortality Rates for "All" locations') +
  theme_minimal()

# Convert ggplot histogram to a plotly plot
plotly_histogram <- ggplotly(histogram, tooltip = "text")

8.5 Normalise Data

merged_data$latest_value.x_positive <- merged_data$latest_value.x + abs(min(merged_data$latest_value.x)) + 0.1

# Apply the Box-Cox transformation
bc_result <- MASS::boxcox(merged_data$latest_value.x_positive ~ 1, 
                    lambda = seq(-3,3,0.1))

# The optimal lambda value is the one that maximizes the log-likelihood
optimal_lambda <- bc_result$x[which.max(bc_result$y)]

# Transform the data using the optimal lambda value
if (optimal_lambda == 0) {
  merged_data$latest_value.x_bc <- log(merged_data$latest_value.x_positive)
} else {
  merged_data$latest_value.x_bc <- (merged_data$latest_value.x_positive^optimal_lambda - 1) / optimal_lambda
}

# Shift the transformed variable to be non-negative
min_value <- min(merged_data$latest_value.x_bc)
merged_data$latest_value.x_bc <- merged_data$latest_value.x_bc + abs(min_value) + 0.1

8.6 Histogram (Normalised)

# Round the transformed mortality rates to the nearest whole number
merged_data$latest_value.x_bc_rounded <- round(merged_data$latest_value.x_bc)

# Convert the latest_value.x_bc_rounded variable into a categorical variable
merged_data$latest_value.x_bc_cat <- cut(merged_data$latest_value.x_bc_rounded, breaks = 50)

# Create a stacked bar chart
stacked_bar_chart_bc <- ggplot(merged_data, aes(x = latest_value.x_bc_rounded, fill = location_desc)) +
  geom_bar(position = "stack", bins =20) +
  labs(x = "Transformed Mortality Rate",
       y = "Count",
       title = "Stacked Bar Chart of Transformed Mortality Rates") +
  theme_minimal()
Warning in geom_bar(position = "stack", bins = 20): Ignoring unknown
parameters: `bins`
ggplot_stacked_bar_chart_bc <- ggplotly(stacked_bar_chart_bc)
ggplot_stacked_bar_chart_bc
# Create a grouped bar chart
grouped_bar_chart_bc <- ggplot(merged_data, aes(x = latest_value.x_bc_rounded, fill = location_desc)) +
  geom_bar(position = "dodge") +
  labs(x = "Transformed Mortality Rate",
       y = "Count",
       title = "Grouped Bar Chart of Transformed Mortality Rates") +
  theme_minimal()

# Convert to a plotly plot
ggplot_grouped_bar_chart_bc <- ggplotly(grouped_bar_chart_bc)
ggplot_grouped_bar_chart_bc

8.7 Scatter plot Normalised

plot_loess <- ggplot(merged_data, aes(x = latest_value.x_bc, y = value_2019)) +
  
  geom_point(aes(color = location_desc, 
                 shape = location_desc,
                 text = paste("Country:", geoAreaName, "<br>", "Location:", location_desc)), alpha = 0.7) +
  
  geom_smooth(data = subset(merged_data, location_desc == "All areas"),
              method = loess, se = FALSE, 
              aes(color = location_desc, weight = value_2019),
              show.legend = TRUE) +
  
  geom_smooth(data = subset(merged_data, location_desc == "Urban"),
              method = loess,
              se = FALSE,
              aes(color = location_desc, weight = value_2019),
              show.legend = TRUE) +

  geom_smooth(data = subset(merged_data, location_desc == "Rural"),
              method = loess,
              se = FALSE,
              aes(color = location_desc, weight = value_2019),
              show.legend = TRUE) +
  
  labs(x = "Transformed Mortality Rate (deaths per 100,000 population)",
       y = "Safely Managed Drinking Water Services (%)",
       title = "Correlation between Transformed Mortality Rate and Water Services (Normalised)",
       color = "Location Type",
       shape = "Location Type") +
  
  theme_minimal() +
  
  guides(color = guide_legend(override.aes = list(shape = 16)),
         shape = guide_legend(override.aes = list(color = "black")))
Warning in geom_point(aes(color = location_desc, shape = location_desc, :
Ignoring unknown aesthetics: text
gp_normalised <- ggplotly(plot_loess, tooltip = "text")
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
gp_normalised

8.8 Correlation normalised

correlation_all <- cor(merged_data$latest_value.x_bc[merged_data$location_desc == "All areas"], 
                       merged_data$value_2019[merged_data$location_desc == "All areas"])

correlation_urban <- cor(merged_data$latest_value.x_bc[merged_data$location_desc == "Urban"], 
                         merged_data$value_2019[merged_data$location_desc == "Urban"])

correlation_rural <- cor(merged_data$latest_value.x_bc[merged_data$location_desc == "Rural"], 
                         merged_data$value_2019[merged_data$location_desc == "Rural"])

blank_plot <- ggplot() +
  theme_void() +
  theme(plot.margin = margin(1, 1, 1, 1, "cm")) +
  annotate("text",
           x = 0.5,
           y = 0.5,
           label = paste("Correlation All:",
                         round(correlation_all, 2), "\n",
                         "Correlation Urban:",
                         round(correlation_urban, 2), "\n",
                         "Correlation Rural:",
                         round(correlation_rural, 2)),
           size = 5,
           color = "black",
           hjust = 0.5)

gp_text_normalised <- ggplotly(blank_plot)
gp_text_normalised

8.9 Combining normal and unormalised plots

combined_plot <- subplot(
  gp, gp_text, 
  gp_normalised, gp_text_normalised, 
  nrows = 2, margin = 0.05
)

# Print the combined plot
combined_plot

9 Four Indicators

#mortality_rate_attributed_to_unsafe_water
mortality_rate_unsafe_water_indicators <- read_csv("data/indicator_3.9.2.csv",show_col_types = FALSE)

#Proportion_of_population_using_safely_managed_drinking_water_services
proportion_of_safe_water_indicators<- read_csv("data/indicator_6.1.1.csv",show_col_types = FALSE)

#mortality_rate_unintentional_poisoning
mortality_rate_unintentional_poisoning_indicators <- read_csv("data/indicator_3.9.3.csv",show_col_types = FALSE)

#proportion-of-population-with-basic-handwashing-facilities-on-premises-by-urban-rural-percent
population_with_basic_handwashing_facilities_indicators<- read_csv("data/indicator_6.2.1.csv",show_col_types = FALSE)
mortality_rate_unsafe_water_indicators <- dplyr::select(
  mortality_rate_unsafe_water_indicators,
  code = 'geoAreaCode',
  country = 'geoAreaName',
  region = 'parentName',
  iso = 'ISO3',
  MR_unsafe_water = 'latest_value'
)

mortality_rate_unsafe_water_indicators <- mortality_rate_unsafe_water_indicators %>%
  distinct()

mortality_rate_unsafe_water_indicators
# A tibble: 183 × 5
    code country                          region           iso   MR_unsafe_water
   <dbl> <chr>                            <chr>            <chr>           <dbl>
 1   583 Micronesia (Federated States of) Oceania (exc. A… FSM               3.6
 2   496 Mongolia                         Eastern Asia     MNG               1.3
 3   499 Montenegro                       Southern Europe  MNE               0  
 4   504 Morocco                          Northern Africa  MAR               1.9
 5   508 Mozambique                       Eastern Africa   MOZ              27.6
 6   104 Myanmar                          South-Eastern A… MMR              12.6
 7   516 Namibia                          Southern Africa  NAM              18.3
 8   524 Nepal                            Southern Asia (… NPL              19.8
 9   528 Netherlands                      Western Europe   NLD               0.2
10   554 New Zealand                      Australia and N… NZL               0.1
# ℹ 173 more rows
proportion_of_safe_water_indicators <- dplyr::select(
  proportion_of_safe_water_indicators,
  code = 'geoAreaCode',
  country = 'geoAreaName',
  region = 'parentName',
  iso = 'ISO3',
  urbanisation = "location_desc",
  sanitation_access = "value_2019",
)

# Remove duplicates
proportion_of_safe_water_indicators <- proportion_of_safe_water_indicators%>%
  distinct()

proportion_of_safe_water_indicators
# A tibble: 286 × 6
    code country                     region iso   urbanisation sanitation_access
   <dbl> <chr>                       <chr>  <chr> <chr>                    <dbl>
 1   807 North Macedonia             South… MKD   Rural                       66
 2   804 Ukraine                     Easte… UKR   Urban                       89
 3   807 North Macedonia             South… MKD   Urban                       85
 4   826 United Kingdom of Great Br… North… GBR   All areas                  100
 5   580 Northern Mariana Islands    Ocean… MNP   All areas                   91
 6   840 United States of America    North… USA   All areas                   97
 7   578 Norway                      North… NOR   All areas                   99
 8   512 Oman                        Weste… OMN   All areas                   90
 9   586 Pakistan                    South… PAK   All areas                   36
10   586 Pakistan                    South… PAK   Rural                       33
# ℹ 276 more rows
mortality_rate_unintentional_poisoning_indicators <- dplyr::select(
  mortality_rate_unintentional_poisoning_indicators,
  code = 'geoAreaCode',
  country = 'geoAreaName',
  region = 'parentName',
  iso = 'ISO3',
  gender = 'sex_desc',
  MR_poisoning = 'value_2019'
)

# drop rows if contains at least one NA value
mortality_rate_unintentional_poisoning_indicators <- na.omit(mortality_rate_unintentional_poisoning_indicators)

# Remove duplicates
mortality_rate_unintentional_poisoning_indicators <- mortality_rate_unintentional_poisoning_indicators%>%
  distinct()

#check for empty row
any(is.na(mortality_rate_unintentional_poisoning_indicators))
[1] FALSE
mortality_rate_unintentional_poisoning_indicators
# A tibble: 549 × 6
    code country     region                          iso   gender   MR_poisoning
   <dbl> <chr>       <chr>                           <chr> <chr>           <dbl>
 1   470 Malta       Southern Europe                 MLT   Both se…          0.1
 2   152 Chile       South America                   CHL   Male              0.5
 3     4 Afghanistan Southern Asia (excluding India) AFG   Both se…          1  
 4   470 Malta       Southern Europe                 MLT   Female            0  
 5   156 China       Eastern Asia                    CHN   Both se…          1.8
 6   470 Malta       Southern Europe                 MLT   Male              0.2
 7   156 China       Eastern Asia                    CHN   Female            1.5
 8   478 Mauritania  Western Africa                  MRT   Both se…          1.5
 9   478 Mauritania  Western Africa                  MRT   Female            1.3
10   478 Mauritania  Western Africa                  MRT   Male              1.7
# ℹ 539 more rows
population_with_basic_handwashing_facilities_indicators <- dplyr::select(
  population_with_basic_handwashing_facilities_indicators,
  code = 'geoAreaCode',
  country = 'geoAreaName',
  region = 'parentName',
  iso = 'ISO3',
  urbanisation = 'location_desc',
  handwash_access = 'value_2019'
)

# drop rows if contains at least one NA value
population_with_basic_handwashing_facilities_indicators <- na.omit(population_with_basic_handwashing_facilities_indicators)

# Remove duplicates
population_with_basic_handwashing_facilities_indicators <- population_with_basic_handwashing_facilities_indicators%>%
  distinct()

#check for empty row
any(is.na(mortality_rate_unintentional_poisoning_indicators))
[1] FALSE
population_with_basic_handwashing_facilities_indicators
# A tibble: 250 × 6
    code country      region                  iso   urbanisation handwash_access
   <dbl> <chr>        <chr>                   <chr> <chr>                  <dbl>
 1     4 Afghanistan  Southern Asia (excludi… AFG   All areas                 38
 2   356 India        Southern Asia           IND   Urban                     82
 3   360 Indonesia    South-Eastern Asia      IDN   All areas                 94
 4     4 Afghanistan  Southern Asia (excludi… AFG   Rural                     29
 5   694 Sierra Leone Western Africa          SLE   Rural                     18
 6   360 Indonesia    South-Eastern Asia      IDN   Rural                     91
 7     4 Afghanistan  Southern Asia (excludi… AFG   Urban                     64
 8    12 Algeria      Northern Africa         DZA   All areas                 85
 9    12 Algeria      Northern Africa         DZA   Rural                     75
10    12 Algeria      Northern Africa         DZA   Urban                     88
# ℹ 240 more rows

9.1 Joining all 4 indicators

countries_tb <- left_join(
  mortality_rate_unintentional_poisoning_indicators,
  population_with_basic_handwashing_facilities_indicators,
  by = join_by(country, iso)
)
  
countries_tb <- left_join(
  countries_tb,
  mortality_rate_unsafe_water_indicators,
  by = join_by(iso)
) 
  
countries_tb <- left_join(
  countries_tb,
  proportion_of_safe_water_indicators,
  by = join_by(iso, urbanisation)
) |>  
  mutate(
    iso = case_match(
      iso,
      "COD" ~ "ZAR",
      "ROU" ~ "ROM",
      "TLS" ~ "TMP",
      "XKX" ~ "KSV",
      .default = iso
    )
  )

names(countries_tb)
 [1] "code.x"            "country.x"         "region.x"         
 [4] "iso"               "gender"            "MR_poisoning"     
 [7] "code.y"            "region.y"          "urbanisation"     
[10] "handwash_access"   "code.x.x"          "country.y"        
[13] "region.x.x"        "MR_unsafe_water"   "code.y.y"         
[16] "country"           "region.y.y"        "sanitation_access"

9.2 Joining with adminGeoJson

countries_sf <- read_sf("./data/WB_countries_Admin0.geojson")
countries_tb <- left_join(
  countries_tb,
  dplyr::select(countries_sf,WB_A3, GDP_MD_EST, INCOME_GRP, POP_EST),
  by = c("iso" = "WB_A3")
)
Warning in left_join(countries_tb, dplyr::select(countries_sf, WB_A3, GDP_MD_EST, : Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 325 of `x` matches multiple rows in `y`.
ℹ Row 126 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
countries_tb
# A tibble: 1,038 × 22
   code.x country.x   region.x         iso   gender MR_poisoning code.y region.y
    <dbl> <chr>       <chr>            <chr> <chr>         <dbl>  <dbl> <chr>   
 1    470 Malta       Southern Europe  MLT   Both …          0.1     NA <NA>    
 2    152 Chile       South America    CHL   Male            0.5     NA <NA>    
 3      4 Afghanistan Southern Asia (… AFG   Both …          1        4 Souther…
 4      4 Afghanistan Southern Asia (… AFG   Both …          1        4 Souther…
 5      4 Afghanistan Southern Asia (… AFG   Both …          1        4 Souther…
 6    470 Malta       Southern Europe  MLT   Female          0       NA <NA>    
 7    156 China       Eastern Asia     CHN   Both …          1.8     NA <NA>    
 8    470 Malta       Southern Europe  MLT   Male            0.2     NA <NA>    
 9    156 China       Eastern Asia     CHN   Female          1.5     NA <NA>    
10    478 Mauritania  Western Africa   MRT   Both …          1.5    478 Western…
# ℹ 1,028 more rows
# ℹ 14 more variables: urbanisation <chr>, handwash_access <dbl>,
#   code.x.x <dbl>, country.y <chr>, region.x.x <chr>, MR_unsafe_water <dbl>,
#   code.y.y <dbl>, country <chr>, region.y.y <chr>, sanitation_access <dbl>,
#   GDP_MD_EST <dbl>, INCOME_GRP <chr>, POP_EST <int>,
#   geometry <MULTIPOLYGON [°]>

9.3 Clean data

Remove certain columns and rename column for cleaner a look

# Drop
clean_datatable <- dplyr::select(countries_tb, 
                          -code.y, -code.x,
                          -region.y, -region.x,
                          -code.x.x, code.y.y, 
                          -country.x, -country.y,
                          -region.x.x, -region.y.y, 
                          -iso)  
# Rename
clean_datatable <- clean_datatable %>% rename(
  gdp = GDP_MD_EST,
  income_group = INCOME_GRP,
  population = POP_EST)

# Rearrange
clean_datatable <- clean_datatable %>%
  dplyr::select(country,
         gdp,
         income_group,
         population,
         urbanisation, 
         gender, 
         handwash_access,  
         sanitation_access, 
         MR_poisoning, 
         MR_unsafe_water,)


datatable(clean_datatable)

9.4 Correlation between 4 indicators

# filter gender of both sexes to reduce duplicates
countries_tb <- countries_tb |>
  filter(gender == "Both sexes") |>
  na.omit(countries_tb)
countries_tb
# A tibble: 143 × 22
   code.x country.x   region.x         iso   gender MR_poisoning code.y region.y
    <dbl> <chr>       <chr>            <chr> <chr>         <dbl>  <dbl> <chr>   
 1      4 Afghanistan Southern Asia (… AFG   Both …          1        4 Souther…
 2      4 Afghanistan Southern Asia (… AFG   Both …          1        4 Souther…
 3      4 Afghanistan Southern Asia (… AFG   Both …          1        4 Souther…
 4    484 Mexico      Central America  MEX   Both …          0.4    484 Central…
 5    496 Mongolia    Eastern Asia     MNG   Both …          2.8    496 Eastern…
 6    496 Mongolia    Eastern Asia     MNG   Both …          2.8    496 Eastern…
 7    496 Mongolia    Eastern Asia     MNG   Both …          2.8    496 Eastern…
 8    499 Montenegro  Southern Europe  MNE   Both …          0.6    499 Souther…
 9    499 Montenegro  Southern Europe  MNE   Both …          0.6    499 Souther…
10    104 Myanmar     South-Eastern A… MMR   Both …          1.3    104 South-E…
# ℹ 133 more rows
# ℹ 14 more variables: urbanisation <chr>, handwash_access <dbl>,
#   code.x.x <dbl>, country.y <chr>, region.x.x <chr>, MR_unsafe_water <dbl>,
#   code.y.y <dbl>, country <chr>, region.y.y <chr>, sanitation_access <dbl>,
#   GDP_MD_EST <dbl>, INCOME_GRP <chr>, POP_EST <int>,
#   geometry <MULTIPOLYGON [°]>
countries_tb_subset <- dplyr::select(countries_tb, MR_poisoning, MR_unsafe_water, handwash_access, sanitation_access,urbanisation,country.x)

# Display the subsetted data
print(countries_tb_subset)
# A tibble: 143 × 6
   MR_poisoning MR_unsafe_water handwash_access sanitation_access urbanisation
          <dbl>           <dbl>           <dbl>             <dbl> <chr>       
 1          1              13.9              38                27 All areas   
 2          1              13.9              29                24 Rural       
 3          1              13.9              64                36 Urban       
 4          0.4             1.1              90                43 All areas   
 5          2.8             1.3              84                30 All areas   
 6          2.8             1.3              77                11 Rural       
 7          2.8             1.3              88                38 Urban       
 8          0.6             0                99                85 All areas   
 9          0.6             0                99                87 Urban       
10          1.3            12.6              74                58 All areas   
# ℹ 133 more rows
# ℹ 1 more variable: country.x <chr>

9.5 Scatterplot matrix

#set axis names
countries_tb_subset_corr4 <- dplyr::select(countries_tb_subset, MR_unsafe_water, MR_poisoning, handwash_access, sanitation_access, urbanisation,country.x)
colnames(countries_tb_subset_corr4) <- c("Unsafe Water Mortality Rate",
                                         "Poisoning Mortality",
                                         "Handwashing Access (%)",
                                         "Sanitation Access (%)",
                                         "Urbanisation",
                                         "Countries"
                                         )
countries_tb_subset_corr4
# A tibble: 143 × 6
   `Unsafe Water Mortality Rate` `Poisoning Mortality` `Handwashing Access (%)`
                           <dbl>                 <dbl>                    <dbl>
 1                          13.9                   1                         38
 2                          13.9                   1                         29
 3                          13.9                   1                         64
 4                           1.1                   0.4                       90
 5                           1.3                   2.8                       84
 6                           1.3                   2.8                       77
 7                           1.3                   2.8                       88
 8                           0                     0.6                       99
 9                           0                     0.6                       99
10                          12.6                   1.3                       74
# ℹ 133 more rows
# ℹ 3 more variables: `Sanitation Access (%)` <dbl>, Urbanisation <chr>,
#   Countries <chr>
#change to factor for colour

countries_tb_subset_corr4$Urbanisation <- as.factor(countries_tb_subset_corr4$Urbanisation)

plot <- ggpairs(data = countries_tb_subset_corr4,
                columns = 1:4,
                upper = list(continuous = "cor"),
                lower = list(continuous = "smooth", se=FALSE),
                diag = list(continuous = "bar", bin=30),
                mapping = aes(color = Urbanisation),
                tooltips = c("Countries")) +
  theme_minimal() +
  theme(axis.title = element_text(size = 5))
Warning in warn_if_args_exist(list(...)): Extra arguments: "tooltips" are being
ignored.  If these are meant to be aesthetics, submit them using the 'mapping'
variable within ggpairs with ggplot2::aes or ggplot2::aes_string.
Warning in check_and_set_ggpairs_defaults("diag", diag, continuous =
"densityDiag", : Changing diag$continuous from 'bar' to 'barDiag'
plot
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Convert ggplot to plotly
plot_interactive <- ggplotly(plot)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Can only have one: highlight
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Can only have one: highlight
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Can only have one: highlight
plot_interactive <- layout(plot_interactive,
       font = list(size = 8)
)

# Render the plotly scatterplot matrix
plot_interactive

9.6 Different Scatter plot matrix

countries_tb_subset_corr4 <- dplyr::select(countries_tb_subset, MR_unsafe_water, MR_poisoning, handwash_access, sanitation_access, urbanisation)

# Rename the columns
colnames(countries_tb_subset_corr4) <- c("Unsafe Water Mortality Rate", 
                                         "Poisoning Mortality",
                                         "Handwashing Access (%)", 
                                         "Sanitation Access (%)",
                                         "Urbanisation")

# Make urbanisation as a factor for colourisation
countries_tb_subset_corr4$Urbanisation <- as.factor(countries_tb_subset_corr4$Urbanisation)

# Map urbanisation to colors
color_mapping <- c("All areas" = "red", "Urban" = "blue", "Rural" = "green") # Adjust to your actual factor levels and desired colors
colors <- color_mapping[countries_tb_subset_corr4$Urbanisation]

# Create scatterplot matrix without colors
plot <- pairs.panels(countries_tb_subset_corr4[, 1:4],
                     hist.col = "#00AFBB",
                     bg = "transparent")

10 K-means clustering

10.1 Scaling the dataset

df <- clean_datatable |>
  dplyr::filter(urbanisation == "All areas", gender == "Both sexes") |>
  dplyr::select(-urbanisation,-gender,-gdp) |>
  na.omit()

kable(head((df)))
country income_group population handwash_access sanitation_access MR_poisoning MR_unsafe_water
Afghanistan 5. Low income 34124811 38 27 1.0 13.9
Mexico 3. Upper middle income 124574795 90 43 0.4 1.1
Mongolia 4. Lower middle income 3068243 84 30 2.8 1.3
Montenegro 3. Upper middle income 642550 99 85 0.6 0.0
Myanmar 5. Low income 55123814 74 58 1.3 12.6
Nepal 5. Low income 29384297 61 19 1.7 19.8
numeric_cols <-
  c("handwash_access",
    "sanitation_access",
    "MR_poisoning",
    "MR_unsafe_water")
numeric_df <- df[numeric_cols]
scaled_df <- scale(numeric_df)
kable(head((scaled_df)))
handwash_access sanitation_access MR_poisoning MR_unsafe_water
-0.7196189 -0.6869441 -0.3505204 -0.2152268
0.9599226 -0.0631583 -0.8736852 -0.7322881
0.7661293 -0.5699842 1.2189739 -0.7242090
1.2506124 1.5742793 -0.6992969 -0.7767231
0.4431406 0.5216408 -0.0889380 -0.2677408
0.0232552 -0.9988370 0.2598385 0.0231062

10.2 Elbow Criterion

fviz_nbclust(scaled_df, kmeans, method = "wss")

** Elbow at 3, so we choose k=3 **

10.3 Do k-means clustering for dataset

scaled_df_table <- data.table(scaled_df)
k <- kmeans(scaled_df_table, centers = 3)
scaled_df_table[, cluster := as.factor(k$cluster)]

country <- df$country
income_group <- df$income_group
population <- df$population
scaled_df_table_with_country <-
  cbind(population, income_group, country, scaled_df_table)

kable(head(scaled_df_table_with_country))
population income_group country handwash_access sanitation_access MR_poisoning MR_unsafe_water cluster
34124811 5. Low income Afghanistan -0.7196189 -0.6869441 -0.3505204 -0.2152268 1
124574795 3. Upper middle income Mexico 0.9599226 -0.0631583 -0.8736852 -0.7322881 1
3068243 4. Lower middle income Mongolia 0.7661293 -0.5699842 1.2189739 -0.7242090 1
642550 3. Upper middle income Montenegro 1.2506124 1.5742793 -0.6992969 -0.7767231 2
55123814 5. Low income Myanmar 0.4431406 0.5216408 -0.0889380 -0.2677408 1
29384297 5. Low income Nepal 0.0232552 -0.9988370 0.2598385 0.0231062 1

11 Create pair plot highlighting clusters

ggpair_plt <- ggpairs(
  scaled_df_table_with_country,
  aes(colour = cluster, text = country),
  diag = list(continuous = "barDiag"),
  columns = c(
    "handwash_access",
    "sanitation_access",
    "MR_poisoning",
    "MR_unsafe_water"
  ),
  columnLabels = c(
    "% Hand-wash\n Facility",
    "% Safe Drinking\n Water",
    "% Death by\n Posion",
    "% Death by\n Unsafe Water"
  )
)
ggplotly_plot <- ggplotly(ggpair_plt, tooltip = c("country"))
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Can only have one: highlight
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Can only have one: highlight
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Can only have one: highlight
ggplotly_plot


1. Red cluster = Low mortality rate
2. Green cluster = High sanitation and Low mortality rate
3. Blue cluster = Low sanitation, high mortality rate

12 Decision Tree

12.1 Add country code to dataset

scaled_df_table_with_cc <-
  scaled_df_table_with_country |>
  mutate(code = countrycode(country, origin = 'country.name', destination = 'wb'))
kable(head(scaled_df_table_with_cc))
population income_group country handwash_access sanitation_access MR_poisoning MR_unsafe_water cluster code
34124811 5. Low income Afghanistan -0.7196189 -0.6869441 -0.3505204 -0.2152268 1 AFG
124574795 3. Upper middle income Mexico 0.9599226 -0.0631583 -0.8736852 -0.7322881 1 MEX
3068243 4. Lower middle income Mongolia 0.7661293 -0.5699842 1.2189739 -0.7242090 1 MNG
642550 3. Upper middle income Montenegro 1.2506124 1.5742793 -0.6992969 -0.7767231 2 MNE
55123814 5. Low income Myanmar 0.4431406 0.5216408 -0.0889380 -0.2677408 1 MMR
29384297 5. Low income Nepal 0.0232552 -0.9988370 0.2598385 0.0231062 1 NPL
any_na <- any(is.na(scaled_df_table_with_cc))
any_na
[1] FALSE
countries_sf <- read_sf("./data/WB_countries_Admin0.geojson")
countries_sf <- st_drop_geometry(countries_sf) |>
  dplyr::select(WB_A3, REGION_WB)

dataset <-
  merge(scaled_df_table_with_cc,
        countries_sf,
        by.x = "code",
        by.y = "WB_A3")
kable(head(dataset))
code population income_group country handwash_access sanitation_access MR_poisoning MR_unsafe_water cluster REGION_WB
AFG 34124811 5. Low income Afghanistan -0.7196189 -0.6869441 -0.3505204 -0.2152268 1 South Asia
ARM 3045191 4. Lower middle income Armenia 1.1214169 1.6132659 -0.6121028 -0.7686440 2 Europe & Central Asia
BGD 157826578 5. Low income Bangladesh -0.1705381 0.5216408 -0.9608793 -0.2960176 1 South Asia
BTN 758288 4. Lower middle income Bhutan 0.9922214 -0.3360646 -1.0480735 -0.6191809 1 South Asia
CAF 5625118 5. Low income Central African Republic -1.2686998 -1.5056629 1.2189739 2.5397406 3 Sub-Saharan Africa
CIV 24184810 4. Lower middle income Côte d’Ivoire -1.2686998 -0.3750512 0.9573916 1.1299406 3 Sub-Saharan Africa
any_na <- any(is.na(dataset))
any_na
[1] FALSE
dplyr::glimpse(dataset)
Rows: 49
Columns: 10
$ code              <chr> "AFG", "ARM", "BGD", "BTN", "CAF", "CIV", "COG", "CO…
$ population        <int> 34124811, 3045191, 157826578, 758288, 5625118, 24184…
$ income_group      <chr> "5. Low income", "4. Lower middle income", "5. Low i…
$ country           <chr> "Afghanistan", "Armenia", "Bangladesh", "Bhutan", "C…
$ handwash_access   <dbl> -0.7196189, 1.1214169, -0.1705381, 0.9922214, -1.268…
$ sanitation_access <dbl> -0.68694407, 1.61326593, 0.52164085, -0.33606458, -1…
$ MR_poisoning      <dbl> -0.35052041, -0.61210280, -0.96087932, -1.04807345, …
$ MR_unsafe_water   <dbl> -0.21522678, -0.76864397, -0.29601761, -0.61918093, …
$ cluster           <fct> 1, 2, 1, 1, 3, 3, 1, 2, 2, 2, 2, 3, 2, 1, 3, 3, 1, 2…
$ REGION_WB         <chr> "South Asia", "Europe & Central Asia", "South Asia",…

12.2 Create factor vairables for dataset

shuffle_index <- sample(1:nrow(dataset))
dataset <- dataset[shuffle_index,]

column_types <- sapply(dataset, class)
print(column_types)
             code        population      income_group           country 
      "character"         "integer"       "character"       "character" 
  handwash_access sanitation_access      MR_poisoning   MR_unsafe_water 
        "numeric"         "numeric"         "numeric"         "numeric" 
          cluster         REGION_WB 
         "factor"       "character" 
unique(dataset$REGION_WB)
[1] "Sub-Saharan Africa"         "Latin America & Caribbean" 
[3] "East Asia & Pacific"        "Europe & Central Asia"     
[5] "South Asia"                 "Middle East & North Africa"
unique(dataset$income_group)
[1] "5. Low income"           "3. Upper middle income" 
[3] "4. Lower middle income"  "2. High income: nonOECD"
setnames(dataset, "REGION_WB", "region")

dataset <- dataset[, c("cluster", "region", "income_group", "population")]
 
dataset$income_group <- factor(
  dataset$income_group,
  levels = c(
    '5. Low income',
    '4. Lower middle income',
    '3. Upper middle income',
    '2. High income: nonOECD'
  ),
  labels = c('low', 'lower middle', 'upper middle', 'high')
)

dataset$region <- factor(
  dataset$region,
  levels = c(
    'Sub-Saharan Africa',
    'South Asia',
    'Middle East & North Africa',
    'Europe & Central Asia',
    'East Asia & Pacific',
    'Latin America & Caribbean'
  ),
  labels = c(
    'Sub-Saharan Africa',
    'South Asia',
    'Middle East & North Africa',
    'Europe & Central Asia',
    'East Asia & Pacific',
    'Latin America & Caribbean'
  )
)

any_na <- any(is.na(dataset))
any_na
[1] FALSE
clean_dataset <- dataset
dplyr::glimpse(clean_dataset)
Rows: 49
Columns: 4
$ cluster      <fct> 3, 1, 3, 3, 1, 3, 1, 2, 3, 3, 1, 3, 1, 1, 1, 2, 3, 1, 3, …
$ region       <fct> Sub-Saharan Africa, Latin America & Caribbean, Sub-Sahara…
$ income_group <fct> low, upper middle, lower middle, low, lower middle, lower…
$ population   <int> 39570125, 124574795, 190632261, 1792338, 3068243, 2418481…

12.3 Train decision tree

train_rows <-
  createDataPartition(clean_dataset$cluster, p = 0.75, list = FALSE)
train <- clean_dataset[train_rows, ]
test <- clean_dataset[-train_rows, ]

# dim(train)
# dim(test)
# prop.table(table(train$cluster))
# prop.table(table(test$cluster))

tree_model <- rpart(cluster ~ ., data = train, method = "class")

split_fun <- function(x, labs, digits, varlen, faclen)
{
  for (i in 1:length(labs)) {
    labs[i] <- paste(strwrap(labs[i], width = 40), collapse = "\n")
  }
  labs
}

rpart.plot(tree_model,
           extra = 102,
           digits = -3,
           split.fun = split_fun)

The decision tree only considered the region variable for classification.

13 Evaluation of decision tree

13.1 Relative importance

rel_impt <- as.data.frame(tree_model$variable.importance)
rel_impt
             tree_model$variable.importance
region                            18.012821
income_group                       6.492674
population                         1.738095

Relative importance of the predictors for classification are as follows:
- Region: 18.01
- Income: 6.49
- Population size: 1.74

13.2 Prediction with decision tree

set.seed(12345)
prediction <- predict(tree_model, newdata = test, type = "class")
confMatrix <- table(prediction, test$cluster)
confMatrix
          
prediction 1 2 3
         1 1 0 0
         2 0 3 0
         3 3 0 3
accuracy <- sum(diag(confMatrix)) / sum(confMatrix)
print(paste('Accuracy: ', accuracy))
[1] "Accuracy:  0.7"

For a single train-test split of [75 25], the model obtained a decent prediction accuracy of 0.7.

13.3 Ten-fold cross validation

train_control <- trainControl(method = "cv",  # Use cross validation
                              number = 10)    # Use 10 folds

# tune_grid = expand.grid(cp=c(0.0001)) # complexity parameter (size of tree)

validated_tree <- train(cluster ~ .,
                        data = clean_dataset,
                        method = "rpart",
                        trControl = train_control)

validated_tree 
CART 

49 samples
 3 predictor
 3 classes: '1', '2', '3' 

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 44, 44, 45, 45, 44, 43, ... 
Resampling results across tuning parameters:

  cp         Accuracy   Kappa     
  0.1000000  0.5750000  0.37451614
  0.1333333  0.4950000  0.25439230
  0.2333333  0.3883333  0.01176471

Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.1.
validated_tree$resample
   Accuracy     Kappa Resample
1      0.40 0.2105263   Fold02
2      0.60 0.3333333   Fold01
3      0.50 0.2000000   Fold03
4      0.50 0.2500000   Fold06
5      0.60 0.4117647   Fold05
6      0.75 0.6363636   Fold04
7      0.80 0.6875000   Fold07
8      0.80 0.6875000   Fold10
9      0.40 0.1176471   Fold09
10     0.40 0.2105263   Fold08

However, the accuracy for ten-fold cross validation was low, at 0.58.
This is probably because the dataset is too small, and the model is overfitting. Overall, the model is unable to generalize to unseen data.